AArch64 4x8 lane ld64 GEMM/IGEMM microkernels.

- Based on the 4x16 microkernel, reduced to a 4x8 tile.
- Update register usage comments for the 4x16 ld64 GEMM/IGEMM kernels and
  reduce the push/pop in the 4x16 ld64 IGEMM kernels to save only x20.
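- The generated .S files are produced from the new template with tools/xngen,
  wired into scripts/generate-qs8-gemm.sh and scripts/generate-qs8-igemm.sh
  (see below); for example, the GEMM variant is regenerated with (PREFETCH=1
  yields the prfm variant):

    tools/xngen src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in \
      -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 \
      -o src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S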

PiperOrigin-RevId: 416107685
diff --git a/BUILD.bazel b/BUILD.bazel
index 12ba1f2..fc36bbc 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4282,9 +4282,9 @@
     "src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
     "src/qs8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16-add16.c",
     "src/qs8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16-add16.c",
+    "src/qs8-f32-vcvt/gen/vcvt-sse2-x32.c",
     "src/qs8-gavgpool/gen/7p7x-minmax-sse2-c8-acc2.c",
     "src/qs8-gavgpool/gen/7x-minmax-sse2-c8-acc2.c",
-    "src/qs8-f32-vcvt/gen/vcvt-sse2-x32.c",
     "src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
     "src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
     "src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
@@ -6438,6 +6438,8 @@
     "src/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mull.S",
     "src/qs8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S",
     "src/qs8-gemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S",
+    "src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S",
+    "src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
     "src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S",
     "src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
     "src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
@@ -6472,6 +6474,8 @@
     "src/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal.S",
     "src/qs8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S",
     "src/qs8-igemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S",
+    "src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S",
+    "src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
     "src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S",
     "src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
     "src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9baf826..4c16652 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5354,6 +5354,8 @@
   src/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mull.S
   src/qs8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S
   src/qs8-gemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S
+  src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+  src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
   src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S
   src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
   src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S
@@ -5388,6 +5390,8 @@
   src/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal.S
   src/qs8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S
   src/qs8-igemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S
+  src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+  src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
   src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S
   src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
   src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S
diff --git a/bench/qs8-gemm-e2e.cc b/bench/qs8-gemm-e2e.cc
index a86b834..a88f1f2 100644
--- a/bench/qs8-gemm-e2e.cc
+++ b/bench/qs8-gemm-e2e.cc
@@ -116,6 +116,26 @@
       4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
       benchmark::utils::CheckNEONDOT);
   }
+  static void qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64,
+      xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64,
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64,
+      xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64,
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
     GEMMEnd2EndBenchmark(state, model,
       xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
@@ -201,6 +221,8 @@
   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld32)
   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld64)
   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld128)
+  BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64)
+  BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64)
   BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
   BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
   BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index c774fe9..0eb4989 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -210,7 +210,6 @@
 }
 #endif  // BENCHMARK_RUY
 
-
 #if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
   static void qs8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, 4, 16, 4, 1,
@@ -236,6 +235,14 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128, 4, 16, 4, 1,
       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
   }
+  static void qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, 4, 8, 1, 1,
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, 4, 8, 1, 1,
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
   static void qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, 4, 16, 1, 1,
       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
@@ -299,6 +306,8 @@
   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld64)
   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld128)
   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_cortex_a55)
+  BENCHMARK_GEMM(qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64)
+  BENCHMARK_GEMM(qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64)
   BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
   BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
   BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
diff --git a/scripts/generate-qs8-gemm.sh b/scripts/generate-qs8-gemm.sh
index 133ac1d..bc0c023 100755
--- a/scripts/generate-qs8-gemm.sh
+++ b/scripts/generate-qs8-gemm.sh
@@ -618,6 +618,9 @@
 
 ############################### AArch64 assembly ##############################
 ### Cortex-A53 lane micro-kernels
+tools/xngen src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=0 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=1 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S &
+
 tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S &
 tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
 
diff --git a/scripts/generate-qs8-igemm.sh b/scripts/generate-qs8-igemm.sh
index 44bb33a..4f9ee2d 100755
--- a/scripts/generate-qs8-igemm.sh
+++ b/scripts/generate-qs8-igemm.sh
@@ -606,6 +606,9 @@
 
 ############################### AArch64 assembly ##############################
 ### Cortex-A53 lane micro-kernels
+tools/xngen src/qs8-igemm/4x8-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=0 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-igemm/4x8-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=1 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S &
+
 tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S &
 tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
 
diff --git a/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
index f9848c0..a3fce54 100644
--- a/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -29,15 +29,13 @@
 # A1 x15 v1
 # A2 x13 v2
 # A3  x4 v3
-# B   x5 v4  v5  v6
+# B   x5 v4  v5
 # C0  x6 v16 v20 v24 v28
 # C1  x8 v17 v21 v25 v29
 # C2  x9 v18 v22 v26 v30
 # C3  x7 v19 v23 v27 v31
 # unused v7 v8 v9 v10 v11 v12 v13 v14 v15
 
-# x10 x17 a53 temp registers
-
 BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
 
         # Clamp A and C pointers
diff --git a/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
index 1c45866..81712d0 100644
--- a/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -29,15 +29,13 @@
 # A1 x15 v1
 # A2 x13 v2
 # A3  x4 v3
-# B   x5 v4  v5  v6
+# B   x5 v4  v5
 # C0  x6 v16 v20 v24 v28
 # C1  x8 v17 v21 v25 v29
 # C2  x9 v18 v22 v26 v30
 # C3  x7 v19 v23 v27 v31
 # unused v7 v8 v9 v10 v11 v12 v13 v14 v15
 
-# x10 x17 a53 temp registers
-
 BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
 
         # Clamp A and C pointers
diff --git a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
index ea14f76..26971a6 100644
--- a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -31,13 +31,12 @@
 # A1  x14  v1
 # A2  x15  v2
 # A3  x20  v3
-# B    x5  v4  v5  v6
+# B    x5  v4  v5
 # C0   x6 v16 v20 v24 v28
 # C1  x16 v17 v21 v25 v29
 # C2  x17 v18 v22 v26 v30
 # C3   x7 v19 v23 v27 v31
 # unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
 
 BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
 
@@ -53,7 +52,7 @@
         CSEL    x17, x16, x17, LS       //   c2 = c1
 
         CMP     x0, 4                   // if mr < 4
-        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        STR     x20, [sp, -16]!         // Save x20 on stack
         ADD     x7,  x17, x7            // c3 = c2 + cm_stride
         CSEL    x7,  x17, x7, LO        //   c3 = c2
 
@@ -401,8 +400,8 @@
         # nc loop
         B.HI    0b
 
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
         # Remainder- 1 to 7 bytes of A
@@ -607,8 +606,8 @@
         STR     b1, [x16]
         STR     b0, [x6]
 9:
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
 END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
diff --git a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
index 908e363..fd683a7 100644
--- a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -31,13 +31,12 @@
 # A1  x14  v1
 # A2  x15  v2
 # A3  x20  v3
-# B    x5  v4  v5  v6
+# B    x5  v4  v5
 # C0   x6 v16 v20 v24 v28
 # C1  x16 v17 v21 v25 v29
 # C2  x17 v18 v22 v26 v30
 # C3   x7 v19 v23 v27 v31
 # unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
 
 BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
 
@@ -53,7 +52,7 @@
         CSEL    x17, x16, x17, LS       //   c2 = c1
 
         CMP     x0, 4                   // if mr < 4
-        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        STR     x20, [sp, -16]!         // Save x20 on stack
         ADD     x7,  x17, x7            // c3 = c2 + cm_stride
         CSEL    x7,  x17, x7, LO        //   c3 = c2
 
@@ -407,8 +406,8 @@
         # nc loop
         B.HI    0b
 
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
         # Remainder- 1 to 7 bytes of A
@@ -613,8 +612,8 @@
         STR     b1, [x16]
         STR     b0, [x6]
 9:
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
 END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
diff --git a/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in b/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
index 8490502..612fb18 100644
--- a/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+++ b/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
@@ -53,7 +53,7 @@
 # A1 x15 v1
 # A2 x13 v2
 # A3  x4 v3
-# B   x5 v4  v5  v6
+# B   x5 v4  v5
 # C0  x6 v16 v20 v24 v28
 # C1  x8 v17 v21 v25 v29
 # C2  x9 v18 v22 v26 v30
@@ -64,8 +64,6 @@
 $else:
   # unused v7 v8 v9 v10 v11 v12 v13 v14 v15
 
-# x10 x17 a53 temp registers
-
 BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
 
         # Clamp A and C pointers
diff --git a/src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in b/src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
new file mode 100644
index 0000000..b53b96c
--- /dev/null
+++ b/src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
@@ -0,0 +1,528 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert REQUANTIZATION in ["FP32", "RNDNU"]
+$assert not CHANNELWISE or REQUANTIZATION == "FP32"
+$assert DATATYPE in ["QC8", "QS8", "QU8"]
+$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
+$assert DATATYPE != "QU8" or REQUANTIZATION == "RNDNU"
+
+#include <xnnpack/assembly.h>
+
+$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
+$if DATATYPE == "QU8":
+  $REWIND_DECREMENT = 15
+$else:
+  $REWIND_DECREMENT = 3 if CHANNELWISE else {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
+$XMIN = "UMIN" if DATATYPE == "QU8" else "SMIN"
+$XMAX = "UMAX" if DATATYPE == "QU8" else "SMAX"
+$XXTL = "UXTL" if DATATYPE == "QU8" else "SXTL"
+$SQXTXN = "SQXTUN" if DATATYPE == "QU8" else "SQXTN"
+$SQXTXN2 = "SQXTUN2" if DATATYPE == "QU8" else "SQXTN2"
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
+# void xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const ${XINT8_T}* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     ${XINT8_T}* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x12
+#     const union ${PARAMS_UNION} params)  [sp + 8] -> x11
+
+$if REQUANTIZATION == "RNDNU" and DATATYPE == "QU8":
+  # params structure is 20 bytes
+  #  struct {
+  #    ${XINT8_T} kernel_zero_point[4];
+  #    int32_t right_pre_shift;
+  #    int32_t multiplier;
+  #    int32_t right_post_shift;
+  #    int16_t output_zero_point;
+  #    ${XINT8_T} output_min;
+  #    ${XINT8_T} output_max;
+  #  } rndnu_neon;
+  #
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3  x4 v3
+# B   x5 v5
+# C0  x6 v24 v28
+# C1  x8 v25 v29
+# C2  x9 v26 v30
+# C3  x7 v27 v31
+$if DATATYPE == "QU8":
+  # zero_point  v7
+  # unused v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+$else:
+  # unused v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+
+BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+        # Clamp A and C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x12, x11, [sp]          // Load cn_stride, params
+        ADD     x15, x3, x4             // a1 = a0 + a_stride
+        ADD     x8, x6, x7              // c1 = c0 + cm_stride
+        CSEL    x15, x3, x15, LO        //   a1 = a0
+        CSEL    x8, x6,  x8, LO         //   c1 = c0
+
+        ADD     x13, x15, x4            // a2 = a1 + a_stride
+        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
+                                        // if mr <= 2
+        CSEL    x13, x15, x13, LS       //   a2 = a1
+        CSEL    x9,  x8,  x9, LS        //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x4, x13, x4             // a3 = a2 + a_stride
+        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
+        CSEL    x4, x13, x4, LO         //   a3 = a2
+        CSEL    x7,  x9, x7, LO         //   c3 = c2
+        $if DATATYPE == "QU8":
+          LD1R    {v7.4s}, [x11], 4        // kernel_zero_point
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q24, q28, [x5], 32
+        SUBS    x0, x2, 8               // k = kc - 8
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        # Is there at least 8 bytes for main loop?
+        B.LO    3f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+1:
+        LD1     {v0.8b},  [x3], 8
+        LDR     d5, [x5], 8
+        LD1     {v1.8b}, [x15], 8
+        LD1     {v2.8b}, [x13], 8
+        LD1     {v3.8b},  [x4], 8
+        ${XXTL}    v0.8h, v0.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        ${XXTL}    v1.8h, v1.8b
+        ${XXTL}    v2.8h, v2.8b
+        ${XXTL}    v3.8h, v3.8b
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x13, 128]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x15, 128]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x3, 128]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x4, 128]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x5, 448]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x5, 512]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    1b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder?- 1 to 7 bytes of A
+        CBNZ    x0, 3f
+
+2:
+        $if REQUANTIZATION == "RNDNU":
+          # Apply params - preshift, scale, postshift, bias and clamp
+          LD1R    {v4.4s}, [x11], 4
+          SQSHL   v24.4s, v24.4s, v4.4s   // shift to upper bits
+          SQSHL   v25.4s, v25.4s, v4.4s
+          SQSHL   v26.4s, v26.4s, v4.4s
+          SQSHL   v27.4s, v27.4s, v4.4s
+          LD1R    {v5.4s}, [x11], 4
+          SQSHL   v28.4s, v28.4s, v4.4s
+          SQSHL   v29.4s, v29.4s, v4.4s
+          SQSHL   v30.4s, v30.4s, v4.4s
+          SQSHL   v31.4s, v31.4s, v4.4s
+          LD1R    {v6.4s}, [x11], 4
+          SQDMULH v24.4s, v24.4s, v5.4s   // scale without rounding
+          SQDMULH v25.4s, v25.4s, v5.4s
+          SQDMULH v26.4s, v26.4s, v5.4s
+          SQDMULH v27.4s, v27.4s, v5.4s
+          SQDMULH v28.4s, v28.4s, v5.4s
+          SQDMULH v29.4s, v29.4s, v5.4s
+          SQDMULH v30.4s, v30.4s, v5.4s
+          SQDMULH v31.4s, v31.4s, v5.4s
+          SRSHL   v24.4s, v24.4s, v6.4s   // signed rounding shift left
+          SRSHL   v25.4s, v25.4s, v6.4s
+          SRSHL   v26.4s, v26.4s, v6.4s
+          SRSHL   v27.4s, v27.4s, v6.4s
+          SRSHL   v28.4s, v28.4s, v6.4s
+          SRSHL   v29.4s, v29.4s, v6.4s
+          SRSHL   v30.4s, v30.4s, v6.4s
+          SRSHL   v31.4s, v31.4s, v6.4s
+        $elif REQUANTIZATION == "FP32":
+          SCVTF   v24.4s, v24.4s
+          SCVTF   v25.4s, v25.4s
+          $if not CHANNELWISE:
+            # Apply params - scale, bias and clamp
+            LD1R    {v4.4s}, [x11], 4
+            SCVTF   v26.4s, v26.4s
+            SCVTF   v27.4s, v27.4s
+          $else:
+            # Load per channel scale values from weights
+            LDR     q4, [x5], 16
+            SCVTF   v26.4s, v26.4s
+            SCVTF   v27.4s, v27.4s
+            LDR     q5, [x5], 16
+          SCVTF   v28.4s, v28.4s
+          SCVTF   v29.4s, v29.4s
+          SCVTF   v30.4s, v30.4s
+          SCVTF   v31.4s, v31.4s
+
+          $if CHANNELWISE:
+            FMUL    v24.4s, v24.4s, v4.4s
+            FMUL    v25.4s, v25.4s, v4.4s
+            FMUL    v26.4s, v26.4s, v4.4s
+            FMUL    v27.4s, v27.4s, v4.4s
+            FMUL    v28.4s, v28.4s, v5.4s
+            FMUL    v29.4s, v29.4s, v5.4s
+            FMUL    v30.4s, v30.4s, v5.4s
+            FMUL    v31.4s, v31.4s, v5.4s
+          $else:
+            FMUL    v24.4s, v24.4s, v4.4s
+            FMUL    v25.4s, v25.4s, v4.4s
+            FMUL    v26.4s, v26.4s, v4.4s
+            FMUL    v27.4s, v27.4s, v4.4s
+            FMUL    v28.4s, v28.4s, v4.4s
+            FMUL    v29.4s, v29.4s, v4.4s
+            FMUL    v30.4s, v30.4s, v4.4s
+            FMUL    v31.4s, v31.4s, v4.4s
+
+          FCVTNS  v24.4s, v24.4s
+          FCVTNS  v25.4s, v25.4s
+          FCVTNS  v26.4s, v26.4s
+          FCVTNS  v27.4s, v27.4s
+          FCVTNS  v28.4s, v28.4s
+          FCVTNS  v29.4s, v29.4s
+          FCVTNS  v30.4s, v30.4s
+          FCVTNS  v31.4s, v31.4s
+
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2       // add bias
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+        LD1R    {v4.8b}, [x11], 1       // clamp min value
+
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v5.8b}, [x11]          // clamp max value
+        ${SQXTXN}  v0.8b, v24.8h
+        ${SQXTXN}  v1.8b, v25.8h
+        ${SQXTXN}  v2.8b, v26.8h
+        ${SQXTXN}  v3.8b, v27.8h
+        SUB     x11, x11, ${REWIND_DECREMENT}             // rewind params pointer
+
+        ${XMAX}    v0.8b, v0.8b, v4.8b
+        ${XMAX}    v1.8b, v1.8b, v4.8b
+        ${XMAX}    v2.8b, v2.8b, v4.8b
+        ${XMAX}    v3.8b, v3.8b, v4.8b
+        SUBS    x1, x1, 8
+        ${XMIN}    v0.8b, v0.8b, v5.8b
+        ${XMIN}    v1.8b, v1.8b, v5.8b
+        ${XMIN}    v2.8b, v2.8b, v5.8b
+        ${XMIN}    v3.8b, v3.8b, v5.8b
+        B.LO    4f
+
+        # Store full 4 x 8
+        ST1     {v0.8b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        ST1     {v1.8b}, [x8], x12
+        SUB     x15, x15, x2            // a1 -= kc
+        ST1     {v2.8b}, [x9], x12
+        SUB     x13, x13, x2            // a2 -= kc
+        ST1     {v3.8b}, [x7], x12
+        SUB     x4,  x4, x2             // a3 -= kc
+        B.NE    0b
+        RET
+
+        # Remainder- 1 to 7 bytes of A
+        .p2align 3
+3:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b},  [x3], x0
+        LDR     d5, [x5], 8
+        LD1     {v1.8b}, [x15], x0
+        LD1     {v2.8b}, [x13], x0
+        LD1     {v3.8b},  [x4], x0
+        ${XXTL}    v0.8h, v0.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        ${XXTL}    v1.8h, v1.8b
+        ${XXTL}    v2.8h, v2.8b
+        ${XXTL}    v3.8h, v3.8b
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    2b
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    2b
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    2b
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    2b
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    2b
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    2b
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       2b
+
+        # Store odd width
+        .p2align 3
+4:
+        TBZ     x1, 2, 5f
+        STR     s0, [x6], 4
+        STR     s1, [x8], 4
+        DUP     s0, v0.s[1]
+        DUP     s1, v1.s[1]
+        STR     s2, [x9], 4
+        STR     s3, [x7], 4
+        DUP     s2, v2.s[1]
+        DUP     s3, v3.s[1]
+5:
+        TBZ     x1, 1, 6f
+        STR     h0, [x6], 2
+        STR     h1, [x8], 2
+        DUP     h0, v0.h[1]
+        DUP     h1, v1.h[1]
+        STR     h2, [x9], 2
+        STR     h3, [x7], 2
+        DUP     h2, v2.h[1]
+        DUP     h3, v3.h[1]
+6:
+        TBZ     x1, 0, 7f
+        STR     b0, [x6]
+        STR     b1, [x8]
+        STR     b2, [x9]
+        STR     b3, [x7]
+7:
+        RET
+
+END_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
index d552539..1a517b6 100644
--- a/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -29,15 +29,13 @@
 # A1 x15 v1
 # A2 x13 v2
 # A3  x4 v3
-# B   x5 v4  v5  v6
+# B   x5 v4  v5
 # C0  x6 v16 v20 v24 v28
 # C1  x8 v17 v21 v25 v29
 # C2  x9 v18 v22 v26 v30
 # C3  x7 v19 v23 v27 v31
 # unused v7 v8 v9 v10 v11 v12 v13 v14 v15
 
-# x10 x17 a53 temp registers
-
 BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
 
         # Clamp A and C pointers
diff --git a/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
index 6f95707..f3567b9 100644
--- a/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -29,15 +29,13 @@
 # A1 x15 v1
 # A2 x13 v2
 # A3  x4 v3
-# B   x5 v4  v5  v6
+# B   x5 v4  v5
 # C0  x6 v16 v20 v24 v28
 # C1  x8 v17 v21 v25 v29
 # C2  x9 v18 v22 v26 v30
 # C3  x7 v19 v23 v27 v31
 # unused v7 v8 v9 v10 v11 v12 v13 v14 v15
 
-# x10 x17 a53 temp registers
-
 BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
 
         # Clamp A and C pointers
diff --git a/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
index 980b41b..e3b2332 100644
--- a/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -29,15 +29,13 @@
 # A1 x15 v1
 # A2 x13 v2
 # A3  x4 v3
-# B   x5 v4  v5  v6
+# B   x5 v4  v5
 # C0  x6 v16 v20 v24 v28
 # C1  x8 v17 v21 v25 v29
 # C2  x9 v18 v22 v26 v30
 # C3  x7 v19 v23 v27 v31
 # unused v7 v8 v9 v10 v11 v12 v13 v14 v15
 
-# x10 x17 a53 temp registers
-
 BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
 
         # Clamp A and C pointers
diff --git a/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
index dd64d8e..7fbc9fa 100644
--- a/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -29,15 +29,13 @@
 # A1 x15 v1
 # A2 x13 v2
 # A3  x4 v3
-# B   x5 v4  v5  v6
+# B   x5 v4  v5
 # C0  x6 v16 v20 v24 v28
 # C1  x8 v17 v21 v25 v29
 # C2  x9 v18 v22 v26 v30
 # C3  x7 v19 v23 v27 v31
 # unused v7 v8 v9 v10 v11 v12 v13 v14 v15
 
-# x10 x17 a53 temp registers
-
 BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
 
         # Clamp A and C pointers
diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..0c9824a
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S
@@ -0,0 +1,746 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/4x8-aarch32-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(
+//     size_t mr,                            r0
+//     size_t nc,                            r1
+//     size_t kc,                            r2 -> r5
+//     const uint8_t*restrict a,             r3
+//     size_t a_stride,          sp + 96  -> (r7)
+//     const void*restrict w,    sp + 100 -> r9
+//     uint8_t*restrict c,       sp + 104 -> r11
+//     size_t cm_stride,         sp + 108 -> (r6)
+//     size_t cn_stride,         sp + 112 -> r7
+//     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  sp + 116 -> (r7)
+
+
+// inner loop registers
+
+// A0  r7  d0
+// A1  r6  d1
+// A2  r2  d2
+// A3  r3  d3
+
+// B    r9  d8,  d9, d10, d11
+// B       d12, d13, d14, d15
+
+// C3  ip r12  [240]q9  q5
+// C2  sl r10       q15 q7
+// C1  fp r11       q2  q4
+// C0  r0           q6  q14
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64
+        .arm
+#ifndef __APPLE__
+        .arch   armv7-a
+        .fpu    neon
+#endif
+        # Push 104 bytes.  r2 is for kc reset
+        VPUSH   {d8-d15}      // 64 bytes
+        PUSH    {r2, r4, r5, r6, r7, r8, r9, sl, fp, lr}   // 40 bytes
+        SUB     sp, sp, #456  // +456 = 560 bytes.  TODO eliminate
+        MOV     lr, r1
+        LDR     r1, [sp, #560]
+        CMP     r0, #2
+        MOV     r4, r3
+        ADDCS   r4, r4, r1
+        CMP     r0, #3
+        LDR     r9, [sp, #580]
+        MOV     r8, #15
+        MOV     r6, r4
+        LDR     ip, [sp, #568]
+        ADDCS   r6, r6, r1
+        CMP     r0, #4
+        LDR     r5, [sp, #572]
+        MOV     r7, r6
+        MOV     sl, ip
+        ADDEQ   r7, r7, r1
+        MOV     r1, r9
+        VLD1.32 {d16-d17}, [r1], r8
+        CMP     r0, #2
+        ADDCS   sl, sl, r5
+        CMP     r0, #3
+        VLD1.8  {d18-d19}, [r1]
+        ADD     r1, r9, #4
+        MOV     fp, sl
+        VLD1.32 {d20-d21}, [r1]
+        ADD     r1, r9, #8
+        ADDCS   fp, fp, r5
+        CMP     r0, #4
+        VLD1.64 {d22-d23}, [r1]
+        ADD     r1, r9, #12
+        MOV     r0, fp
+        VLD1.32 {d24-d25}, [r1]
+        ADD     r1, r9, #14
+        ADDEQ   r0, r0, r5
+        MOV     r9, #32
+        VLD1.16 {d26-d27}, [r1]
+        MOV     r1, r2
+        MOV     r2, r4
+        ADD     r4, sp, #128
+        VDUP.32 q0, d16[0]
+        LDR     r5, [sp, #576]
+        LDR     r8, [sp, #564]  // w
+        VDUP.8  q8, d26[0]
+        STR     r1, [sp, #56]
+        VDUP.8  q1, d18[0]
+        VSTMIA  r4, {d16-d17}
+        ADD     r4, sp, #112
+        VDUP.16 q8, d24[0]
+        VSTMIA  r4, {d16-d17}
+        ADD     r4, sp, #96
+        VDUP.32 q8, d22[0]
+        VSTMIA  r4, {d16-d17}
+        ADD     r4, sp, #80
+        VDUP.32 q8, d20[0]
+        VSTMIA  r4, {d16-d17}
+        ADD     r4, sp, #32
+        VSTMIA  r4, {d0-d1}
+        ADD     r4, sp, #16
+        VSTMIA  r4, {d2-d3}
+0:
+        # Load initial bias from w into accumulators
+        ADD     r4, r8, #16
+        VLD1.8  {d16-d17}, [r8], r9   // Bias
+        CMP     r1, #8
+        STR     lr, [sp, #12]
+        ADD     lr, sp, #240
+        VLD1.8  {d10-d11}, [r4]
+        VSTMIA  lr, {d16-d17}
+        LDR     lr, [sp, #12]
+        BCC     2f                     // less than 8 channels?  skip main loop
+        STR     lr, [sp, #64]
+        ADD     lr, sp, #240
+        VORR    q7, q5, q5
+        STR     ip, [sp, #68]
+        VLDMIA  lr, {d6-d7}
+        VORR    q14, q5, q5
+        VORR    q6, q5, q5
+        MOV     ip, #0
+        VORR    q15, q3, q3
+        MOV     r4, r1
+        VORR    q4, q3, q3
+        STR     r0, [sp, #72]
+        STR     fp, [sp, #76]
+        STR     sl, [sp, #60]
+        STR     r7, [sp, #156]
+
+        # Main loop - 8 bytes of A
+
+1:
+        MOV     r9, r4
+        MOV     r4, r7
+        LDR     lr, [r8, #4]
+        MOV     r7, r6
+        MOV     r6, r2
+        LDR     sl, [r4, ip]!
+        LDR     fp, [r8]
+        LDR     r2, [r8, #8]
+        LDR     r1, [r8, #12]
+        STR     lr, [sp, #364]
+        ADD     lr, sp, #192
+        LDR     r0, [r8, #24]
+        STR     fp, [sp, #360]
+        LDR     r5, [r8, #20]
+        STR     r2, [sp, #384]
+        ADD     r2, sp, #360
+        STR     r1, [sp, #388]
+        VLD1.8  {d16}, [r2 :64]
+        ADD     r2, sp, #384
+        VLD1.8  {d17}, [r2 :64]
+        VMOVL.S8 q10, d16
+        LDR     r1, [r8, #28]
+        LDR     r2, [r8, #16]
+        STR     sl, [sp, #416]
+        STR     r1, [sp, #380]
+        STR     r0, [sp, #376]
+        STR     r5, [sp, #372]
+        STR     r2, [sp, #368]
+        LDR     r0, [r4, #4]
+        MOV     r4, r9
+        STR     r0, [sp, #420]
+        ADD     r0, sp, #416
+        SUB     r4, r9, #8
+        VLD1.8  {d18}, [r0 :64]
+        ADD     r0, sp, #376
+        CMP     r4, #7
+        VMOVL.S8 q0, d18
+        VLD1.8  {d16}, [r0 :64]
+        VMOVL.S8 q9, d17
+        ADD     r0, sp, #368
+        VMLAL.S16 q6, d21, d0[0]
+        VLD1.8  {d17}, [r0 :64]
+        VORR    q11, q9, q9
+        LDR     r0, [r8, #32]
+        VMLAL.S16 q3, d20, d0[0]
+        LDR     r1, [r8, #36]
+        VMLAL.S16 q6, d19, d0[1]
+        LDR     r2, [r8, #40]
+        VMOVL.S8 q9, d17
+        LDR     r5, [r8, #44]
+        STR     r0, [sp, #424]
+        ADD     r0, sp, #424
+        STR     r1, [sp, #428]
+        VMLAL.S16 q6, d19, d0[2]
+        VORR    q1, q9, q9
+        VMOVL.S8 q9, d16
+        VMLAL.S16 q6, d19, d0[3]
+        VSTMIA  lr, {d18-d19}
+        ADD     lr, sp, #176
+        VORR    q9, q11, q11
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #432
+        VMLAL.S16 q3, d18, d0[1]
+        VMOVL.S8 q8, d16
+        STR     r2, [sp, #432]
+        STR     r5, [sp, #436]
+        MOV     r2, r6
+        MOV     r6, r7
+        LDR     r7, [sp, #156]
+        VMLAL.S16 q6, d17, d1[0]
+        LDR     r1, [r8, #48]
+        VSTMIA  lr, {d16-d17}
+        ADD     lr, sp, #208
+        VLD1.8  {d16}, [r0 :64]
+        LDR     r0, [r8, #52]
+        VMOVL.S8 q8, d16
+        STR     r0, [sp, #444]
+        ADD     r0, sp, #440
+        STR     r1, [sp, #440]
+        VLD1.8  {d24}, [r0 :64]
+        VMLAL.S16 q6, d17, d1[1]
+        VORR    q2, q8, q8
+        LDR     r0, [r8, #60]
+        VMOVL.S8 q8, d24
+        LDR     r1, [r8, #56]
+        ADD     r8, r8, #64
+        STR     r0, [sp, #452]
+        ADD     r0, sp, #448
+        STR     r1, [sp, #448]
+        VMLAL.S16 q6, d17, d1[2]
+        VLD1.8  {d26}, [r0 :64]
+        VORR    q12, q8, q8
+        MOV     r0, r6
+        VMOVL.S8 q13, d26
+        LDR     r1, [r0, ip]!
+        VMLAL.S16 q6, d27, d1[3]
+        VSTMIA  lr, {d12-d13}
+        VORR    q6, q10, q10
+        VORR    q10, q1, q1
+        ADD     lr, sp, #256
+        VORR    q1, q12, q12
+        VSTMIA  lr, {d22-d23}
+        ADD     lr, sp, #192
+        VMLAL.S16 q3, d20, d0[2]
+        VLDMIA  lr, {d22-d23}
+        ADD     lr, sp, #176
+        VLDMIA  lr, {d16-d17}
+        ADD     lr, sp, #160
+        VMLAL.S16 q3, d22, d0[3]
+        STR     r1, [sp, #408]
+        LDR     r0, [r0, #4]
+        STR     r0, [sp, #412]
+        ADD     r0, sp, #408
+        VMLAL.S16 q3, d16, d1[0]
+        VMLAL.S16 q3, d4, d1[1]
+        VMLAL.S16 q3, d24, d1[2]
+        VORR    q12, q8, q8
+        VMLAL.S16 q3, d26, d1[3]
+        VLD1.8  {d0}, [r0 :64]
+        MOV     r0, r2
+        VMOVL.S8 q0, d0
+        VSTMIA  lr, {d4-d5}
+        ADD     lr, sp, #256
+        LDR     r1, [r0, ip]!
+        VMLAL.S16 q14, d13, d0[0]
+        VMLAL.S16 q4, d12, d0[0]
+        VMLAL.S16 q14, d19, d0[1]
+        VORR    q9, q10, q10
+        VMLAL.S16 q14, d21, d0[2]
+        VORR    q10, q6, q6
+        VMLAL.S16 q14, d23, d0[3]
+        VMLAL.S16 q14, d17, d1[0]
+        VORR    q8, q2, q2
+        VLDMIA  lr, {d4-d5}
+        ADD     lr, sp, #160
+        VMLAL.S16 q4, d4, d0[1]
+        STR     r1, [sp, #400]
+        VMLAL.S16 q14, d17, d1[1]
+        LDR     r0, [r0, #4]
+        STR     r0, [sp, #404]
+        ADD     r0, sp, #400
+        VMLAL.S16 q4, d18, d0[2]
+        VMLAL.S16 q14, d3, d1[2]
+        VMLAL.S16 q4, d22, d0[3]
+        VMLAL.S16 q14, d27, d1[3]
+        VMLAL.S16 q4, d24, d1[0]
+        VMLAL.S16 q4, d16, d1[1]
+        VLDMIA  lr, {d16-d17}
+        ADD     lr, sp, #224
+        VMLAL.S16 q4, d2, d1[2]
+        VMLAL.S16 q4, d26, d1[3]
+        VLD1.8  {d0}, [r0 :64]
+        MOV     r0, r3
+        VMOVL.S8 q0, d0
+        LDR     r1, [r0, ip]!
+        ADD     ip, ip, #8
+        VMLAL.S16 q7, d13, d0[0]
+        VMLAL.S16 q15, d20, d0[0]
+        VMLAL.S16 q7, d5, d0[1]
+        VORR    q2, q9, q9
+        VMLAL.S16 q7, d19, d0[2]
+        VORR    q9, q1, q1
+        VMLAL.S16 q7, d23, d0[3]
+        VMLAL.S16 q7, d25, d1[0]
+        VMLAL.S16 q7, d17, d1[1]
+        VMLAL.S16 q7, d3, d1[2]
+        VMLAL.S16 q7, d27, d1[3]
+        VSTMIA  lr, {d14-d15}
+        ADD     lr, sp, #240
+        STR     r1, [sp, #392]
+        LDR     r0, [r0, #4]
+        STR     r0, [sp, #396]
+        ADD     r0, sp, #392
+        VLDMIA  lr, {d12-d13}
+        ADD     lr, sp, #256
+        VLD1.8  {d2}, [r0 :64]
+        VMOVL.S8 q1, d2
+        VLDMIA  lr, {d14-d15}
+        ADD     lr, sp, #224
+        VMLAL.S16 q15, d14, d0[1]
+        VMLAL.S16 q6, d20, d2[0]
+        VMLAL.S16 q5, d21, d2[0]
+        VMLAL.S16 q15, d4, d0[2]
+        VMLAL.S16 q6, d14, d2[1]
+        VMLAL.S16 q5, d15, d2[1]
+        VLDMIA  lr, {d14-d15}
+        ADD     lr, sp, #240
+        VMLAL.S16 q15, d22, d0[3]
+        VMLAL.S16 q6, d4, d2[2]
+        VMLAL.S16 q5, d5, d2[2]
+        VMLAL.S16 q15, d24, d1[0]
+        VMLAL.S16 q6, d22, d2[3]
+        VMLAL.S16 q5, d23, d2[3]
+        VMLAL.S16 q15, d16, d1[1]
+        VMLAL.S16 q6, d24, d3[0]
+        VMLAL.S16 q5, d25, d3[0]
+        VMLAL.S16 q15, d18, d1[2]
+        VMLAL.S16 q6, d16, d3[1]
+        VMLAL.S16 q5, d17, d3[1]
+        VMLAL.S16 q15, d26, d1[3]
+        VMLAL.S16 q6, d18, d3[2]
+        VMLAL.S16 q5, d19, d3[2]
+        VMLAL.S16 q6, d26, d3[3]
+        VMLAL.S16 q5, d27, d3[3]
+        VSTMIA  lr, {d12-d13}
+        ADD     lr, sp, #208
+        VLDMIA  lr, {d12-d13}
+        BHI     1b
+        ADD     r5, sp, #32
+        ADD     lr, sp, #56
+        VORR    q2, q4, q4
+        ADD     r7, r7, ip
+        VLDMIA  r5, {d0-d1}
+        ADD     r5, sp, #16
+        VORR    q4, q14, q14
+        ADD     r6, r6, ip
+        VLDMIA  r5, {d2-d3}
+        ADD     r2, r2, ip
+        ADD     r3, r3, ip
+        VORR    q14, q3, q3
+        LDR     ip, [sp, #68]
+        MOV     r9, #32
+        LDR     fp, [sp, #76]
+        LDR     r0, [sp, #72]
+        LDR     r5, [sp, #576]
+        LDM     lr, {r1, sl, lr}
+        B       3f
+2:
+        STR     lr, [sp, #12]
+        ADD     lr, sp, #240
+        VORR    q15, q5, q5
+        MOV     r4, r1
+        VLDMIA  lr, {d16-d17}
+        VORR    q6, q5, q5
+        VORR    q4, q5, q5
+        LDR     lr, [sp, #12]
+        VORR    q14, q8, q8
+        VORR    q2, q8, q8
+        VORR    q7, q5, q5
+        VORR    q15, q8, q8
+3:
+        CMP     r4, #0
+        BNE     5f
+
+        # rndnu quantization
+        # C3 [240]q9  q5
+        # C2      q15 q7
+        # C1      q2  q4
+        # C0      q6  q14
+
+4:
+        ADD     r4, sp, #80
+        VSHL.S32 q11, q2, q0
+        CMP     lr, #7
+        VLDMIA  r4, {d4-d5}
+
+        VSHL.S32 q13, q15, q0
+        VLDR    q15, [sp, 240]           // q15 spilled
+
+        VSHL.S32 q8, q6, q0
+        VSHL.S32 q9, q14, q0
+        VSHL.S32 q10, q4, q0
+        VSHL.S32 q12, q7, q0
+        VSHL.S32 q14, q5, q0
+        VSHL.S32 q15, q15, q0
+
+        VQDMULH.S32 q8, q8, q2
+        VQDMULH.S32 q9, q9, q2
+        VQDMULH.S32 q10, q10, q2
+        VQDMULH.S32 q12, q12, q2
+        VQDMULH.S32 q11, q11, q2
+        VQDMULH.S32 q13, q13, q2
+        VQDMULH.S32 q14, q14, q2
+        VQDMULH.S32 q15, q15, q2
+        VLDMIA  r4, {d4-d5}
+        VRSHL.S32 q8, q8, q2
+        VRSHL.S32 q9, q9, q2
+        VRSHL.S32 q10, q10, q2
+        VRSHL.S32 q12, q12, q2
+        VRSHL.S32 q11, q11, q2
+        VRSHL.S32 q13, q13, q2
+        VRSHL.S32 q14, q14, q2
+        VRSHL.S32 q15, q15, q2
+        VQMOVN.S32 d17, q8
+        VQMOVN.S32 d16, q9
+        VQMOVN.S32 d19, q10
+        VQMOVN.S32 d21, q12
+        VLDMIA  r4, {d24-d25}
+        VQMOVN.S32 d18, q11
+        VQMOVN.S32 d20, q13
+        VQMOVN.S32 d23, q14
+        VQMOVN.S32 d22, q15
+        VQADD.S16 q8, q8, q12
+        VQADD.S16 q9, q9, q12
+        VQADD.S16 q10, q10, q12
+        VQADD.S16 q11, q11, q12
+        VQMOVN.S16 d17, q8
+        VQMOVN.S16 d16, q9
+        VQMOVN.S16 d19, q10
+        VQMOVN.S16 d18, q11
+        VLDMIA  r4, {d20-d21}
+        VMAX.S8 q8, q8, q10
+        VMAX.S8 q10, q9, q10
+        SUBS    lr, lr, #8
+        VMIN.S8 q9, q8, q1
+        VMIN.S8 q11, q10, q1
+        BLS     9f
+
+        # Store full 4 x 8
+        VST1.8  {d22}, [ip], r5
+        SUB     r7, r7, r1
+        VST1.8  {d23}, [sl], r5
+        SUB     r6, r6, r1
+        VST1.8  {d18}, [fp], r5
+        SUB     r2, r2, r1
+        VST1.8  {d19}, [r0], r5
+        SUB     r3, r3, r1
+        BNE     0b
+
+        ADD     sp, sp, #460  // skip over r2.
+        POP     {r4, r5, r6, r7, r8, r9, sl, fp, lr}
+        VPOP    {d8-d15}
+        BX      lr
+
+5:
+        STR     r0, [sp, #72]
+        ADD     r0, r8, #8
+        STR     r0, [sp, #224]
+        MOV     r5, r7
+        MOV     r7, r6
+        LDR     r0, [r8]
+        STR     r0, [sp, #256]
+        MOV     r6, r2
+        LDR     r2, [r7]
+        MOV     r9, r3
+        LDR     r1, [r7, #4]
+        CMP     r4, #1
+        LDR     r0, [r5, #4]
+        STR     fp, [sp, #76]
+        LDR     fp, [r8, #4]
+        LDR     r3, [r5]
+        STR     r1, [sp, #340]
+        STR     r2, [sp, #336]
+        STR     r0, [sp, #348]
+        LDR     r0, [r6]
+        LDR     r2, [r6, #4]
+        STR     r3, [sp, #344]
+        MOV     r3, r9
+        STR     fp, [sp, #356]
+        ADD     r3, r4, r9
+        LDR     r1, [sp, #256]
+        STR     r1, [sp, #352]
+        STR     r0, [sp, #328]
+        ADD     r0, sp, #336
+        STR     r2, [sp, #332]
+        MOV     r2, r6
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #344
+        MOV     r6, r7
+        MOV     r7, r5
+        VLD1.8  {d17}, [r0 :64]
+        ADD     r0, sp, #352
+        VMOVL.S8 q10, d16
+        ADD     r7, r4, r5
+        VLD1.8  {d18}, [r0 :64]
+        ADD     r0, sp, #328
+        VMOVL.S8 q3, d17
+        ADD     r6, r4, r6
+        VLD1.8  {d17}, [r0 :64]
+        VMOVL.S8 q9, d18
+        VORR    q11, q10, q10
+        ADD     r2, r4, r2
+        VMOVL.S8 q8, d17
+        LDR     r0, [r9]
+        LDR     r1, [r9, #4]
+        VMLAL.S16 q6, d19, d6[0]
+        STR     r0, [sp, #320]
+        VORR    q12, q3, q3
+        VMLAL.S16 q14, d18, d6[0]
+        ADD     r0, sp, #320
+        VORR    q3, q10, q10
+        STR     r1, [sp, #324]
+        VORR    q10, q8, q8
+        VMLAL.S16 q4, d19, d6[0]
+        VMLAL.S16 q2, d18, d6[0]
+        VORR    q3, q8, q8
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #240
+        VMLAL.S16 q7, d19, d6[0]
+        VMLAL.S16 q15, d18, d6[0]
+        VMOVL.S8 q3, d16
+        VLDMIA  r0, {d16-d17}
+        ADD     r0, sp, #240
+        VMLAL.S16 q8, d18, d6[0]
+        VMLAL.S16 q5, d19, d6[0]
+        VSTMIA  r0, {d16-d17}
+        BNE     6f
+        LDR     r8, [sp, #224]
+        MOV     r9, #32
+        LDR     fp, [sp, #76]
+        B       8f
+6:
+        LDR     r5, [sp, #224]
+        VORR    q13, q3, q3
+        VORR    q3, q12, q12
+        CMP     r4, #3
+        MOV     r9, #32
+        LDR     r0, [r5]
+        LDR     r1, [r5, #4]
+        STR     r0, [sp, #312]
+        ADD     r0, sp, #312
+        STR     r1, [sp, #316]
+        ADD     r1, r5, #8
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #240
+        VMOVL.S8 q8, d16
+        VLDMIA  r0, {d18-d19}
+        ADD     r0, sp, #240
+        LDR     fp, [sp, #76]
+        VMLAL.S16 q6, d17, d6[1]
+        VMLAL.S16 q14, d16, d6[1]
+        VORR    q3, q11, q11
+        VMLAL.S16 q4, d17, d6[1]
+        VMLAL.S16 q2, d16, d6[1]
+        VORR    q3, q10, q10
+        VMLAL.S16 q7, d17, d6[1]
+        VMLAL.S16 q15, d16, d6[1]
+        VORR    q3, q13, q13
+        VMLAL.S16 q9, d16, d6[1]
+        VMLAL.S16 q5, d17, d6[1]
+        VSTMIA  r0, {d18-d19}
+        BCC     7f
+        LDR     r0, [r1]
+        VORR    q3, q12, q12
+        LDR     r1, [r1, #4]
+        CMP     r4, #3
+        STR     r0, [sp, #304]
+        ADD     r0, sp, #304
+        STR     r1, [sp, #308]
+        ADD     r1, r5, #16
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #240
+        VMOVL.S8 q8, d16
+        VLDMIA  r0, {d18-d19}
+        ADD     r0, sp, #240
+        VMLAL.S16 q6, d17, d6[2]
+        VMLAL.S16 q14, d16, d6[2]
+        VORR    q3, q11, q11
+        VMLAL.S16 q4, d17, d6[2]
+        VMLAL.S16 q2, d16, d6[2]
+        VORR    q3, q10, q10
+        VMLAL.S16 q7, d17, d6[2]
+        VMLAL.S16 q15, d16, d6[2]
+        VORR    q3, q13, q13
+        VMLAL.S16 q9, d16, d6[2]
+        VMLAL.S16 q5, d17, d6[2]
+        VSTMIA  r0, {d18-d19}
+        BEQ     7f
+        LDR     r0, [r1]
+        VORR    q3, q12, q12
+        LDR     r1, [r1, #4]
+        CMP     r4, #5
+        STR     r0, [sp, #296]
+        ADD     r0, sp, #296
+        STR     r1, [sp, #300]
+        ADD     r1, r5, #24
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #240
+        VMOVL.S8 q8, d16
+        VLDMIA  r0, {d18-d19}
+        ADD     r0, sp, #240
+        VMLAL.S16 q6, d17, d6[3]
+        VMLAL.S16 q14, d16, d6[3]
+        VORR    q3, q11, q11
+        VMLAL.S16 q4, d17, d6[3]
+        VMLAL.S16 q2, d16, d6[3]
+        VORR    q3, q10, q10
+        VMLAL.S16 q7, d17, d6[3]
+        VMLAL.S16 q15, d16, d6[3]
+        VORR    q3, q13, q13
+        VMLAL.S16 q9, d16, d6[3]
+        VMLAL.S16 q5, d17, d6[3]
+        VSTMIA  r0, {d18-d19}
+        BCC     7f
+        LDR     r0, [r1]
+        VORR    q3, q12, q12
+        LDR     r1, [r1, #4]
+        CMP     r4, #5
+        STR     r0, [sp, #288]
+        ADD     r0, sp, #288
+        STR     r1, [sp, #292]
+        ADD     r1, r5, #32
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #240
+        VMOVL.S8 q8, d16
+        VLDMIA  r0, {d18-d19}
+        ADD     r0, sp, #240
+        VMLAL.S16 q6, d17, d7[0]
+        VMLAL.S16 q14, d16, d7[0]
+        VORR    q3, q11, q11
+        VMLAL.S16 q4, d17, d7[0]
+        VMLAL.S16 q2, d16, d7[0]
+        VORR    q3, q10, q10
+        VMLAL.S16 q7, d17, d7[0]
+        VMLAL.S16 q15, d16, d7[0]
+        VORR    q3, q13, q13
+        VMLAL.S16 q9, d16, d7[0]
+        VMLAL.S16 q5, d17, d7[0]
+        VSTMIA  r0, {d18-d19}
+        BEQ     7f
+        LDR     r0, [r1]
+        VORR    q3, q12, q12
+        LDR     r1, [r1, #4]
+        CMP     r4, #7
+        STR     r0, [sp, #280]
+        ADD     r0, sp, #280
+        STR     r1, [sp, #284]
+        ADD     r1, r5, #40
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #240
+        VMOVL.S8 q8, d16
+        VLDMIA  r0, {d18-d19}
+        ADD     r0, sp, #240
+        VMLAL.S16 q6, d17, d7[1]
+        VMLAL.S16 q14, d16, d7[1]
+        VORR    q3, q11, q11
+        VMLAL.S16 q4, d17, d7[1]
+        VMLAL.S16 q2, d16, d7[1]
+        VORR    q3, q10, q10
+        VMLAL.S16 q7, d17, d7[1]
+        VMLAL.S16 q15, d16, d7[1]
+        VORR    q3, q13, q13
+        VMLAL.S16 q9, d16, d7[1]
+        VMLAL.S16 q5, d17, d7[1]
+        VSTMIA  r0, {d18-d19}
+        BCC     7f
+        LDR     r0, [r1]
+        VORR    q3, q12, q12
+        LDR     r1, [r1, #4]
+        ADD     r8, r8, #56
+        STR     r0, [sp, #272]
+        ADD     r0, sp, #272
+        STR     r1, [sp, #276]
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #240
+        VMOVL.S8 q8, d16
+        VLDMIA  r0, {d18-d19}
+        ADD     r0, sp, #240
+        VMLAL.S16 q6, d17, d7[2]
+        VMLAL.S16 q14, d16, d7[2]
+        VORR    q3, q11, q11
+        VMLAL.S16 q4, d17, d7[2]
+        VMLAL.S16 q2, d16, d7[2]
+        VORR    q3, q10, q10
+        VMLAL.S16 q7, d17, d7[2]
+        VMLAL.S16 q15, d16, d7[2]
+        VORR    q3, q13, q13
+        VMLAL.S16 q9, d16, d7[2]
+        VMLAL.S16 q5, d17, d7[2]
+        VSTMIA  r0, {d18-d19}
+        B       8f
+7:
+        MOV     r8, r1
+8:
+        LDR     r0, [sp, #72]
+        LDR     r1, [sp, #56]
+        LDR     r5, [sp, #576]
+        B       4b
+
+        # Store odd width
+9:
+        TST     lr, #4
+        BEQ     10f
+        VST1.32 {d22[0]}, [ip]!
+        VST1.32 {d23[0]}, [sl]!
+        VST1.32 {d18[0]}, [fp]!
+        VST1.32 {d19[0]}, [r0]!
+        VEXT.8  q9, q9, q9, #4
+        VEXT.8  q11, q11, q11, #4
+10:
+        TST     lr, #2
+        BEQ     11f
+        VST1.16 {d22[0]}, [ip]!
+        VST1.16 {d23[0]}, [sl]!
+        VST1.16 {d18[0]}, [fp]!
+        VST1.16 {d19[0]}, [r0]!
+        VEXT.8  q9, q9, q9, #2
+        VEXT.8  q11, q11, q11, #2
+11:
+        TST     lr, #1
+        BEQ     12f
+        VST1.8  {d22[0]}, [ip]
+        VST1.8  {d23[0]}, [sl]
+        VST1.8  {d18[0]}, [fp]
+        VST1.8  {d19[0]}, [r0]
+12:
+        ADD     sp, sp, #460  // skip over r2.
+        POP     {r4, r5, r6, r7, r8, r9, sl, fp, lr}
+        VPOP    {d8-d15}
+        BX      lr
+
+END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
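
The kernel above (and the prfm variant added next) computes a 4x8 output tile by widening each A byte and each packed-B row to 16 bits and accumulating into eight int32 lanes per row (VMLAL.S16 by lane on AArch32, SMLAL/SMLAL2 by lane on AArch64). A minimal C sketch of that inner-loop arithmetic, assuming a simplified packed-weights layout of 8 int32 biases followed by kc rows of 8 int8 weights (names here are illustrative, not the XNNPACK API):

#include <stddef.h>
#include <stdint.h>

// Reference for one 4x8 tile, kc taken as an exact byte count (sketch only).
static void gemm_4x8_ref(size_t kc, const int8_t* a, size_t a_stride,
                         const void* w, int32_t acc[4][8]) {
  const int32_t* bias = (const int32_t*) w;        // first 32 bytes of w
  const int8_t* b = (const int8_t*) (bias + 8);    // then 8 weights per k step
  for (size_t m = 0; m < 4; m++) {
    for (size_t n = 0; n < 8; n++) {
      acc[m][n] = bias[n];                         // initial bias load
    }
  }
  for (size_t k = 0; k < kc; k++) {
    for (size_t m = 0; m < 4; m++) {
      const int32_t ak = (int32_t) a[m * a_stride + k];   // sign-extended A lane
      for (size_t n = 0; n < 8; n++) {
        acc[m][n] += ak * (int32_t) b[k * 8 + n];         // widening multiply-accumulate
      }
    }
  }
}

The assembly unrolls the k loop by 8 and keeps the whole acc tile in NEON registers; the requantization of acc into int8 outputs is sketched after the AArch64 ld64 kernel further below.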
diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..0969ac9
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,747 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/4x8-aarch32-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(
+//     size_t mr,                            r0
+//     size_t nc,                            r1
+//     size_t kc,                            r2 -> r5
+//     const int8_t*restrict a,              r3
+//     size_t a_stride,          sp + 96  -> (r7)
+//     const void*restrict w,    sp + 100 -> r9
+//     int8_t*restrict c,        sp + 104 -> r11
+//     size_t cm_stride,         sp + 108 -> (r6)
+//     size_t cn_stride,         sp + 112 -> r7
+//     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  sp + 116 -> (r7)
+
+
+// inner loop registers
+
+// A0  r7  d0
+// A1  r6  d1
+// A2  r2  d2
+// A3  r3  d3
+
+// B    r9  d8,  d9, d10, d11
+// B       d12, d13, d14, d15
+
+// C3  ip r12  [240]q9  q5
+// C2  sl r10       q15 q7
+// C1  fp r11       q2  q4
+// C0  r0           q6  q14
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64
+        .arm
+#ifndef __APPLE__
+        .arch   armv7-a
+        .fpu    neon
+#endif
+        # Push 104 bytes.  r2 is for kc reset
+        VPUSH   {d8-d15}      // 64 bytes
+        PUSH    {r2, r4, r5, r6, r7, r8, r9, sl, fp, lr}   // 40 bytes
+        SUB     sp, sp, #456  // +456 = 560 bytes.  TODO eliminate
+        MOV     lr, r1
+        LDR     r1, [sp, #560]
+        CMP     r0, #2
+        MOV     r4, r3
+        ADDCS   r4, r4, r1
+        CMP     r0, #3
+        LDR     r9, [sp, #580]
+        MOV     r8, #15
+        MOV     r6, r4
+        LDR     ip, [sp, #568]
+        ADDCS   r6, r6, r1
+        CMP     r0, #4
+        LDR     r5, [sp, #572]
+        MOV     r7, r6
+        MOV     sl, ip
+        ADDEQ   r7, r7, r1
+        MOV     r1, r9
+        VLD1.32 {d16-d17}, [r1], r8
+        CMP     r0, #2
+        ADDCS   sl, sl, r5
+        CMP     r0, #3
+        VLD1.8  {d18-d19}, [r1]
+        ADD     r1, r9, #4
+        MOV     fp, sl
+        VLD1.32 {d20-d21}, [r1]
+        ADD     r1, r9, #8
+        ADDCS   fp, fp, r5
+        CMP     r0, #4
+        VLD1.64 {d22-d23}, [r1]
+        ADD     r1, r9, #12
+        MOV     r0, fp
+        VLD1.32 {d24-d25}, [r1]
+        ADD     r1, r9, #14
+        ADDEQ   r0, r0, r5
+        MOV     r9, #32
+        VLD1.16 {d26-d27}, [r1]
+        MOV     r1, r2
+        MOV     r2, r4
+        ADD     r4, sp, #128
+        VDUP.32 q0, d16[0]
+        LDR     r5, [sp, #576]
+        LDR     r8, [sp, #564]  // w
+        VDUP.8  q8, d26[0]
+        STR     r1, [sp, #56]
+        VDUP.8  q1, d18[0]
+        VSTMIA  r4, {d16-d17}
+        ADD     r4, sp, #112
+        VDUP.16 q8, d24[0]
+        VSTMIA  r4, {d16-d17}
+        ADD     r4, sp, #96
+        VDUP.32 q8, d22[0]
+        VSTMIA  r4, {d16-d17}
+        ADD     r4, sp, #80
+        VDUP.32 q8, d20[0]
+        VSTMIA  r4, {d16-d17}
+        ADD     r4, sp, #32
+        VSTMIA  r4, {d0-d1}
+        ADD     r4, sp, #16
+        VSTMIA  r4, {d2-d3}
+0:
+        # Load initial bias from w into accumulators
+        ADD     r4, r8, #16
+        VLD1.8  {d16-d17}, [r8], r9   // Bias
+        CMP     r1, #8
+        STR     lr, [sp, #12]
+        ADD     lr, sp, #240
+        VLD1.8  {d10-d11}, [r4]
+        VSTMIA  lr, {d16-d17}
+        LDR     lr, [sp, #12]
+        BCC     2f                     // less than 8 channels?  skip main loop
+        STR     lr, [sp, #64]
+        ADD     lr, sp, #240
+        VORR    q7, q5, q5
+        STR     ip, [sp, #68]
+        VLDMIA  lr, {d6-d7}
+        VORR    q14, q5, q5
+        VORR    q6, q5, q5
+        MOV     ip, #0
+        VORR    q15, q3, q3
+        MOV     r4, r1
+        VORR    q4, q3, q3
+        STR     r0, [sp, #72]
+        STR     fp, [sp, #76]
+        STR     sl, [sp, #60]
+        STR     r7, [sp, #156]
+
+        # Main loop - 8 bytes of A
+
+1:
+        MOV     r9, r4
+        MOV     r4, r7
+        LDR     lr, [r8, #4]
+        MOV     r7, r6
+        MOV     r6, r2
+        LDR     sl, [r4, ip]!
+        LDR     fp, [r8]
+        LDR     r2, [r8, #8]
+        LDR     r1, [r8, #12]
+        STR     lr, [sp, #364]
+        ADD     lr, sp, #192
+        LDR     r0, [r8, #24]
+        STR     fp, [sp, #360]
+        LDR     r5, [r8, #20]
+        STR     r2, [sp, #384]
+        ADD     r2, sp, #360
+        STR     r1, [sp, #388]
+        VLD1.8  {d16}, [r2 :64]
+        ADD     r2, sp, #384
+        VLD1.8  {d17}, [r2 :64]
+        VMOVL.S8 q10, d16
+        LDR     r1, [r8, #28]
+        LDR     r2, [r8, #16]
+        STR     sl, [sp, #416]
+        STR     r1, [sp, #380]
+        STR     r0, [sp, #376]
+        STR     r5, [sp, #372]
+        STR     r2, [sp, #368]
+        LDR     r0, [r4, #4]
+        MOV     r4, r9
+        STR     r0, [sp, #420]
+        ADD     r0, sp, #416
+        SUB     r4, r9, #8
+        VLD1.8  {d18}, [r0 :64]
+        ADD     r0, sp, #376
+        CMP     r4, #7
+        VMOVL.S8 q0, d18
+        VLD1.8  {d16}, [r0 :64]
+        VMOVL.S8 q9, d17
+        ADD     r0, sp, #368
+        VMLAL.S16 q6, d21, d0[0]
+        VLD1.8  {d17}, [r0 :64]
+        VORR    q11, q9, q9
+        PLD     [r8, #480]
+        LDR     r0, [r8, #32]
+        VMLAL.S16 q3, d20, d0[0]
+        LDR     r1, [r8, #36]
+        VMLAL.S16 q6, d19, d0[1]
+        LDR     r2, [r8, #40]
+        VMOVL.S8 q9, d17
+        LDR     r5, [r8, #44]
+        STR     r0, [sp, #424]
+        ADD     r0, sp, #424
+        STR     r1, [sp, #428]
+        VMLAL.S16 q6, d19, d0[2]
+        VORR    q1, q9, q9
+        VMOVL.S8 q9, d16
+        VMLAL.S16 q6, d19, d0[3]
+        VSTMIA  lr, {d18-d19}
+        ADD     lr, sp, #176
+        VORR    q9, q11, q11
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #432
+        VMLAL.S16 q3, d18, d0[1]
+        VMOVL.S8 q8, d16
+        STR     r2, [sp, #432]
+        STR     r5, [sp, #436]
+        MOV     r2, r6
+        MOV     r6, r7
+        LDR     r7, [sp, #156]
+        VMLAL.S16 q6, d17, d1[0]
+        LDR     r1, [r8, #48]
+        VSTMIA  lr, {d16-d17}
+        ADD     lr, sp, #208
+        VLD1.8  {d16}, [r0 :64]
+        LDR     r0, [r8, #52]
+        VMOVL.S8 q8, d16
+        STR     r0, [sp, #444]
+        ADD     r0, sp, #440
+        STR     r1, [sp, #440]
+        VLD1.8  {d24}, [r0 :64]
+        VMLAL.S16 q6, d17, d1[1]
+        VORR    q2, q8, q8
+        LDR     r0, [r8, #60]
+        VMOVL.S8 q8, d24
+        LDR     r1, [r8, #56]
+        ADD     r8, r8, #64
+        STR     r0, [sp, #452]
+        ADD     r0, sp, #448
+        STR     r1, [sp, #448]
+        VMLAL.S16 q6, d17, d1[2]
+        VLD1.8  {d26}, [r0 :64]
+        VORR    q12, q8, q8
+        MOV     r0, r6
+        VMOVL.S8 q13, d26
+        LDR     r1, [r0, ip]!
+        VMLAL.S16 q6, d27, d1[3]
+        VSTMIA  lr, {d12-d13}
+        VORR    q6, q10, q10
+        VORR    q10, q1, q1
+        ADD     lr, sp, #256
+        VORR    q1, q12, q12
+        VSTMIA  lr, {d22-d23}
+        ADD     lr, sp, #192
+        VMLAL.S16 q3, d20, d0[2]
+        VLDMIA  lr, {d22-d23}
+        ADD     lr, sp, #176
+        VLDMIA  lr, {d16-d17}
+        ADD     lr, sp, #160
+        VMLAL.S16 q3, d22, d0[3]
+        STR     r1, [sp, #408]
+        LDR     r0, [r0, #4]
+        STR     r0, [sp, #412]
+        ADD     r0, sp, #408
+        VMLAL.S16 q3, d16, d1[0]
+        VMLAL.S16 q3, d4, d1[1]
+        VMLAL.S16 q3, d24, d1[2]
+        VORR    q12, q8, q8
+        VMLAL.S16 q3, d26, d1[3]
+        VLD1.8  {d0}, [r0 :64]
+        MOV     r0, r2
+        VMOVL.S8 q0, d0
+        VSTMIA  lr, {d4-d5}
+        ADD     lr, sp, #256
+        LDR     r1, [r0, ip]!
+        VMLAL.S16 q14, d13, d0[0]
+        VMLAL.S16 q4, d12, d0[0]
+        VMLAL.S16 q14, d19, d0[1]
+        VORR    q9, q10, q10
+        VMLAL.S16 q14, d21, d0[2]
+        VORR    q10, q6, q6
+        VMLAL.S16 q14, d23, d0[3]
+        VMLAL.S16 q14, d17, d1[0]
+        VORR    q8, q2, q2
+        VLDMIA  lr, {d4-d5}
+        ADD     lr, sp, #160
+        VMLAL.S16 q4, d4, d0[1]
+        STR     r1, [sp, #400]
+        VMLAL.S16 q14, d17, d1[1]
+        LDR     r0, [r0, #4]
+        STR     r0, [sp, #404]
+        ADD     r0, sp, #400
+        VMLAL.S16 q4, d18, d0[2]
+        VMLAL.S16 q14, d3, d1[2]
+        VMLAL.S16 q4, d22, d0[3]
+        VMLAL.S16 q14, d27, d1[3]
+        VMLAL.S16 q4, d24, d1[0]
+        VMLAL.S16 q4, d16, d1[1]
+        VLDMIA  lr, {d16-d17}
+        ADD     lr, sp, #224
+        VMLAL.S16 q4, d2, d1[2]
+        VMLAL.S16 q4, d26, d1[3]
+        VLD1.8  {d0}, [r0 :64]
+        MOV     r0, r3
+        VMOVL.S8 q0, d0
+        LDR     r1, [r0, ip]!
+        ADD     ip, ip, #8
+        VMLAL.S16 q7, d13, d0[0]
+        VMLAL.S16 q15, d20, d0[0]
+        VMLAL.S16 q7, d5, d0[1]
+        VORR    q2, q9, q9
+        VMLAL.S16 q7, d19, d0[2]
+        VORR    q9, q1, q1
+        VMLAL.S16 q7, d23, d0[3]
+        VMLAL.S16 q7, d25, d1[0]
+        VMLAL.S16 q7, d17, d1[1]
+        VMLAL.S16 q7, d3, d1[2]
+        VMLAL.S16 q7, d27, d1[3]
+        VSTMIA  lr, {d14-d15}
+        ADD     lr, sp, #240
+        STR     r1, [sp, #392]
+        LDR     r0, [r0, #4]
+        STR     r0, [sp, #396]
+        ADD     r0, sp, #392
+        VLDMIA  lr, {d12-d13}
+        ADD     lr, sp, #256
+        VLD1.8  {d2}, [r0 :64]
+        VMOVL.S8 q1, d2
+        VLDMIA  lr, {d14-d15}
+        ADD     lr, sp, #224
+        VMLAL.S16 q15, d14, d0[1]
+        VMLAL.S16 q6, d20, d2[0]
+        VMLAL.S16 q5, d21, d2[0]
+        VMLAL.S16 q15, d4, d0[2]
+        VMLAL.S16 q6, d14, d2[1]
+        VMLAL.S16 q5, d15, d2[1]
+        VLDMIA  lr, {d14-d15}
+        ADD     lr, sp, #240
+        VMLAL.S16 q15, d22, d0[3]
+        VMLAL.S16 q6, d4, d2[2]
+        VMLAL.S16 q5, d5, d2[2]
+        VMLAL.S16 q15, d24, d1[0]
+        VMLAL.S16 q6, d22, d2[3]
+        VMLAL.S16 q5, d23, d2[3]
+        VMLAL.S16 q15, d16, d1[1]
+        VMLAL.S16 q6, d24, d3[0]
+        VMLAL.S16 q5, d25, d3[0]
+        VMLAL.S16 q15, d18, d1[2]
+        VMLAL.S16 q6, d16, d3[1]
+        VMLAL.S16 q5, d17, d3[1]
+        VMLAL.S16 q15, d26, d1[3]
+        VMLAL.S16 q6, d18, d3[2]
+        VMLAL.S16 q5, d19, d3[2]
+        VMLAL.S16 q6, d26, d3[3]
+        VMLAL.S16 q5, d27, d3[3]
+        VSTMIA  lr, {d12-d13}
+        ADD     lr, sp, #208
+        VLDMIA  lr, {d12-d13}
+        BHI     1b
+        ADD     r5, sp, #32
+        ADD     lr, sp, #56
+        VORR    q2, q4, q4
+        ADD     r7, r7, ip
+        VLDMIA  r5, {d0-d1}
+        ADD     r5, sp, #16
+        VORR    q4, q14, q14
+        ADD     r6, r6, ip
+        VLDMIA  r5, {d2-d3}
+        ADD     r2, r2, ip
+        ADD     r3, r3, ip
+        VORR    q14, q3, q3
+        LDR     ip, [sp, #68]
+        MOV     r9, #32
+        LDR     fp, [sp, #76]
+        LDR     r0, [sp, #72]
+        LDR     r5, [sp, #576]
+        LDM     lr, {r1, sl, lr}
+        B       3f
+2:
+        STR     lr, [sp, #12]
+        ADD     lr, sp, #240
+        VORR    q15, q5, q5
+        MOV     r4, r1
+        VLDMIA  lr, {d16-d17}
+        VORR    q6, q5, q5
+        VORR    q4, q5, q5
+        LDR     lr, [sp, #12]
+        VORR    q14, q8, q8
+        VORR    q2, q8, q8
+        VORR    q7, q5, q5
+        VORR    q15, q8, q8
+3:
+        CMP     r4, #0
+        BNE     5f
+
+        # rndnu quantization
+        # C3 [240]q9  q5
+        # C2      q15 q7
+        # C1      q2  q4
+        # C0      q6  q14
+
+4:
+        ADD     r4, sp, #80
+        VSHL.S32 q11, q2, q0
+        CMP     lr, #7
+        VLDMIA  r4, {d4-d5}
+
+        VSHL.S32 q13, q15, q0
+        VLDR    d30, [sp, #240]          // reload low half of spilled q15
+        VLDR    d31, [sp, #248]          // reload high half of spilled q15
+
+        VSHL.S32 q8, q6, q0
+        VSHL.S32 q9, q14, q0
+        VSHL.S32 q10, q4, q0
+        VSHL.S32 q12, q7, q0
+        VSHL.S32 q14, q5, q0
+        VSHL.S32 q15, q15, q0
+
+        VQDMULH.S32 q8, q8, q2
+        VQDMULH.S32 q9, q9, q2
+        VQDMULH.S32 q10, q10, q2
+        VQDMULH.S32 q12, q12, q2
+        VQDMULH.S32 q11, q11, q2
+        VQDMULH.S32 q13, q13, q2
+        VQDMULH.S32 q14, q14, q2
+        VQDMULH.S32 q15, q15, q2
+        VLDMIA  r4, {d4-d5}
+        VRSHL.S32 q8, q8, q2
+        VRSHL.S32 q9, q9, q2
+        VRSHL.S32 q10, q10, q2
+        VRSHL.S32 q12, q12, q2
+        VRSHL.S32 q11, q11, q2
+        VRSHL.S32 q13, q13, q2
+        VRSHL.S32 q14, q14, q2
+        VRSHL.S32 q15, q15, q2
+        VQMOVN.S32 d17, q8
+        VQMOVN.S32 d16, q9
+        VQMOVN.S32 d19, q10
+        VQMOVN.S32 d21, q12
+        VLDMIA  r4, {d24-d25}
+        VQMOVN.S32 d18, q11
+        VQMOVN.S32 d20, q13
+        VQMOVN.S32 d23, q14
+        VQMOVN.S32 d22, q15
+        VQADD.S16 q8, q8, q12
+        VQADD.S16 q9, q9, q12
+        VQADD.S16 q10, q10, q12
+        VQADD.S16 q11, q11, q12
+        VQMOVN.S16 d17, q8
+        VQMOVN.S16 d16, q9
+        VQMOVN.S16 d19, q10
+        VQMOVN.S16 d18, q11
+        VLDMIA  r4, {d20-d21}
+        VMAX.S8 q8, q8, q10
+        VMAX.S8 q10, q9, q10
+        SUBS    lr, lr, #8
+        VMIN.S8 q9, q8, q1
+        VMIN.S8 q11, q10, q1
+        BLS     9f
+
+        # Store full 4 x 8
+        VST1.8  {d22}, [ip], r5
+        SUB     r7, r7, r1
+        VST1.8  {d23}, [sl], r5
+        SUB     r6, r6, r1
+        VST1.8  {d18}, [fp], r5
+        SUB     r2, r2, r1
+        VST1.8  {d19}, [r0], r5
+        SUB     r3, r3, r1
+        BNE     0b
+
+        ADD     sp, sp, #460  // skip over r2.
+        POP     {r4, r5, r6, r7, r8, r9, sl, fp, lr}
+        VPOP    {d8-d15}
+        BX      lr
+
+5:
+        STR     r0, [sp, #72]
+        ADD     r0, r8, #8
+        STR     r0, [sp, #224]
+        MOV     r5, r7
+        MOV     r7, r6
+        LDR     r0, [r8]
+        STR     r0, [sp, #256]
+        MOV     r6, r2
+        LDR     r2, [r7]
+        MOV     r9, r3
+        LDR     r1, [r7, #4]
+        CMP     r4, #1
+        LDR     r0, [r5, #4]
+        STR     fp, [sp, #76]
+        LDR     fp, [r8, #4]
+        LDR     r3, [r5]
+        STR     r1, [sp, #340]
+        STR     r2, [sp, #336]
+        STR     r0, [sp, #348]
+        LDR     r0, [r6]
+        LDR     r2, [r6, #4]
+        STR     r3, [sp, #344]
+        MOV     r3, r9
+        STR     fp, [sp, #356]
+        ADD     r3, r4, r9
+        LDR     r1, [sp, #256]
+        STR     r1, [sp, #352]
+        STR     r0, [sp, #328]
+        ADD     r0, sp, #336
+        STR     r2, [sp, #332]
+        MOV     r2, r6
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #344
+        MOV     r6, r7
+        MOV     r7, r5
+        VLD1.8  {d17}, [r0 :64]
+        ADD     r0, sp, #352
+        VMOVL.S8 q10, d16
+        ADD     r7, r4, r5
+        VLD1.8  {d18}, [r0 :64]
+        ADD     r0, sp, #328
+        VMOVL.S8 q3, d17
+        ADD     r6, r4, r6
+        VLD1.8  {d17}, [r0 :64]
+        VMOVL.S8 q9, d18
+        VORR    q11, q10, q10
+        ADD     r2, r4, r2
+        VMOVL.S8 q8, d17
+        LDR     r0, [r9]
+        LDR     r1, [r9, #4]
+        VMLAL.S16 q6, d19, d6[0]
+        STR     r0, [sp, #320]
+        VORR    q12, q3, q3
+        VMLAL.S16 q14, d18, d6[0]
+        ADD     r0, sp, #320
+        VORR    q3, q10, q10
+        STR     r1, [sp, #324]
+        VORR    q10, q8, q8
+        VMLAL.S16 q4, d19, d6[0]
+        VMLAL.S16 q2, d18, d6[0]
+        VORR    q3, q8, q8
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #240
+        VMLAL.S16 q7, d19, d6[0]
+        VMLAL.S16 q15, d18, d6[0]
+        VMOVL.S8 q3, d16
+        VLDMIA  r0, {d16-d17}
+        ADD     r0, sp, #240
+        VMLAL.S16 q8, d18, d6[0]
+        VMLAL.S16 q5, d19, d6[0]
+        VSTMIA  r0, {d16-d17}
+        BNE     6f
+        LDR     r8, [sp, #224]
+        MOV     r9, #32
+        LDR     fp, [sp, #76]
+        B       8f
+6:
+        LDR     r5, [sp, #224]
+        VORR    q13, q3, q3
+        VORR    q3, q12, q12
+        CMP     r4, #3
+        MOV     r9, #32
+        LDR     r0, [r5]
+        LDR     r1, [r5, #4]
+        STR     r0, [sp, #312]
+        ADD     r0, sp, #312
+        STR     r1, [sp, #316]
+        ADD     r1, r5, #8
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #240
+        VMOVL.S8 q8, d16
+        VLDMIA  r0, {d18-d19}
+        ADD     r0, sp, #240
+        LDR     fp, [sp, #76]
+        VMLAL.S16 q6, d17, d6[1]
+        VMLAL.S16 q14, d16, d6[1]
+        VORR    q3, q11, q11
+        VMLAL.S16 q4, d17, d6[1]
+        VMLAL.S16 q2, d16, d6[1]
+        VORR    q3, q10, q10
+        VMLAL.S16 q7, d17, d6[1]
+        VMLAL.S16 q15, d16, d6[1]
+        VORR    q3, q13, q13
+        VMLAL.S16 q9, d16, d6[1]
+        VMLAL.S16 q5, d17, d6[1]
+        VSTMIA  r0, {d18-d19}
+        BCC     7f
+        LDR     r0, [r1]
+        VORR    q3, q12, q12
+        LDR     r1, [r1, #4]
+        CMP     r4, #3
+        STR     r0, [sp, #304]
+        ADD     r0, sp, #304
+        STR     r1, [sp, #308]
+        ADD     r1, r5, #16
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #240
+        VMOVL.S8 q8, d16
+        VLDMIA  r0, {d18-d19}
+        ADD     r0, sp, #240
+        VMLAL.S16 q6, d17, d6[2]
+        VMLAL.S16 q14, d16, d6[2]
+        VORR    q3, q11, q11
+        VMLAL.S16 q4, d17, d6[2]
+        VMLAL.S16 q2, d16, d6[2]
+        VORR    q3, q10, q10
+        VMLAL.S16 q7, d17, d6[2]
+        VMLAL.S16 q15, d16, d6[2]
+        VORR    q3, q13, q13
+        VMLAL.S16 q9, d16, d6[2]
+        VMLAL.S16 q5, d17, d6[2]
+        VSTMIA  r0, {d18-d19}
+        BEQ     7f
+        LDR     r0, [r1]
+        VORR    q3, q12, q12
+        LDR     r1, [r1, #4]
+        CMP     r4, #5
+        STR     r0, [sp, #296]
+        ADD     r0, sp, #296
+        STR     r1, [sp, #300]
+        ADD     r1, r5, #24
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #240
+        VMOVL.S8 q8, d16
+        VLDMIA  r0, {d18-d19}
+        ADD     r0, sp, #240
+        VMLAL.S16 q6, d17, d6[3]
+        VMLAL.S16 q14, d16, d6[3]
+        VORR    q3, q11, q11
+        VMLAL.S16 q4, d17, d6[3]
+        VMLAL.S16 q2, d16, d6[3]
+        VORR    q3, q10, q10
+        VMLAL.S16 q7, d17, d6[3]
+        VMLAL.S16 q15, d16, d6[3]
+        VORR    q3, q13, q13
+        VMLAL.S16 q9, d16, d6[3]
+        VMLAL.S16 q5, d17, d6[3]
+        VSTMIA  r0, {d18-d19}
+        BCC     7f
+        LDR     r0, [r1]
+        VORR    q3, q12, q12
+        LDR     r1, [r1, #4]
+        CMP     r4, #5
+        STR     r0, [sp, #288]
+        ADD     r0, sp, #288
+        STR     r1, [sp, #292]
+        ADD     r1, r5, #32
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #240
+        VMOVL.S8 q8, d16
+        VLDMIA  r0, {d18-d19}
+        ADD     r0, sp, #240
+        VMLAL.S16 q6, d17, d7[0]
+        VMLAL.S16 q14, d16, d7[0]
+        VORR    q3, q11, q11
+        VMLAL.S16 q4, d17, d7[0]
+        VMLAL.S16 q2, d16, d7[0]
+        VORR    q3, q10, q10
+        VMLAL.S16 q7, d17, d7[0]
+        VMLAL.S16 q15, d16, d7[0]
+        VORR    q3, q13, q13
+        VMLAL.S16 q9, d16, d7[0]
+        VMLAL.S16 q5, d17, d7[0]
+        VSTMIA  r0, {d18-d19}
+        BEQ     7f
+        LDR     r0, [r1]
+        VORR    q3, q12, q12
+        LDR     r1, [r1, #4]
+        CMP     r4, #7
+        STR     r0, [sp, #280]
+        ADD     r0, sp, #280
+        STR     r1, [sp, #284]
+        ADD     r1, r5, #40
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #240
+        VMOVL.S8 q8, d16
+        VLDMIA  r0, {d18-d19}
+        ADD     r0, sp, #240
+        VMLAL.S16 q6, d17, d7[1]
+        VMLAL.S16 q14, d16, d7[1]
+        VORR    q3, q11, q11
+        VMLAL.S16 q4, d17, d7[1]
+        VMLAL.S16 q2, d16, d7[1]
+        VORR    q3, q10, q10
+        VMLAL.S16 q7, d17, d7[1]
+        VMLAL.S16 q15, d16, d7[1]
+        VORR    q3, q13, q13
+        VMLAL.S16 q9, d16, d7[1]
+        VMLAL.S16 q5, d17, d7[1]
+        VSTMIA  r0, {d18-d19}
+        BCC     7f
+        LDR     r0, [r1]
+        VORR    q3, q12, q12
+        LDR     r1, [r1, #4]
+        ADD     r8, r8, #56
+        STR     r0, [sp, #272]
+        ADD     r0, sp, #272
+        STR     r1, [sp, #276]
+        VLD1.8  {d16}, [r0 :64]
+        ADD     r0, sp, #240
+        VMOVL.S8 q8, d16
+        VLDMIA  r0, {d18-d19}
+        ADD     r0, sp, #240
+        VMLAL.S16 q6, d17, d7[2]
+        VMLAL.S16 q14, d16, d7[2]
+        VORR    q3, q11, q11
+        VMLAL.S16 q4, d17, d7[2]
+        VMLAL.S16 q2, d16, d7[2]
+        VORR    q3, q10, q10
+        VMLAL.S16 q7, d17, d7[2]
+        VMLAL.S16 q15, d16, d7[2]
+        VORR    q3, q13, q13
+        VMLAL.S16 q9, d16, d7[2]
+        VMLAL.S16 q5, d17, d7[2]
+        VSTMIA  r0, {d18-d19}
+        B       8f
+7:
+        MOV     r8, r1
+8:
+        LDR     r0, [sp, #72]
+        LDR     r1, [sp, #56]
+        LDR     r5, [sp, #576]
+        B       4b
+
+        # Store odd width
+9:
+        TST     lr, #4
+        BEQ     10f
+        VST1.32 {d22[0]}, [ip]!
+        VST1.32 {d23[0]}, [sl]!
+        VST1.32 {d18[0]}, [fp]!
+        VST1.32 {d19[0]}, [r0]!
+        VEXT.8  q9, q9, q9, #4
+        VEXT.8  q11, q11, q11, #4
+10:
+        TST     lr, #2
+        BEQ     11f
+        VST1.16 {d22[0]}, [ip]!
+        VST1.16 {d23[0]}, [sl]!
+        VST1.16 {d18[0]}, [fp]!
+        VST1.16 {d19[0]}, [r0]!
+        VEXT.8  q9, q9, q9, #2
+        VEXT.8  q11, q11, q11, #2
+11:
+        TST     lr, #1
+        BEQ     12f
+        VST1.8  {d22[0]}, [ip]
+        VST1.8  {d23[0]}, [sl]
+        VST1.8  {d18[0]}, [fp]
+        VST1.8  {d19[0]}, [r0]
+12:
+        ADD     sp, sp, #460  // skip over r2.
+        POP     {r4, r5, r6, r7, r8, r9, sl, fp, lr}
+        VPOP    {d8-d15}
+        BX      lr
+
+END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
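
Both AArch32 kernels finish with the same odd-width store: when fewer than 8 output columns remain, the low three bits of the remaining column count select a 4-byte, a 2-byte and a 1-byte store per row, with a VEXT rotation after each store so lane 0 always holds the next unwritten element. A small C sketch of that tail logic (illustrative helper, not the XNNPACK API):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Store a remainder of 1..7 bytes from an 8-byte result row.
static void store_odd_width(int8_t* c, const int8_t row[8], size_t remainder) {
  size_t offset = 0;
  if (remainder & 4) {            // TST #4: 32-bit lane store, then rotate by 4 (VEXT.8 #4)
    memcpy(c + offset, row + offset, 4);
    offset += 4;
  }
  if (remainder & 2) {            // TST #2: 16-bit lane store, then rotate by 2 (VEXT.8 #2)
    memcpy(c + offset, row + offset, 2);
    offset += 2;
  }
  if (remainder & 1) {            // TST #1: final byte
    c[offset] = row[offset];
  }
}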
diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..1a4555e
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,391 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3  x4 v3
+# B   x5 v5
+# C0  x6 v24 v28
+# C1  x8 v25 v29
+# C2  x9 v26 v30
+# C3  x7 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64
+
+        # Clamp A and C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x12, x11, [sp]          // Load cn_stride, params
+        ADD     x15, x3, x4             // a1 = a0 + a_stride
+        ADD     x8, x6, x7              // c1 = c0 + cm_stride
+        CSEL    x15, x3, x15, LO        //   a1 = a0
+        CSEL    x8, x6,  x8, LO         //   c1 = c0
+
+        ADD     x13, x15, x4            // a2 = a1 + a_stride
+        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
+                                        // if mr <= 2
+        CSEL    x13, x15, x13, LS       //   a2 = a1
+        CSEL    x9,  x8,  x9, LS        //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x4, x13, x4             // a3 = a2 + a_stride
+        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
+        CSEL    x4, x13, x4, LO         //   a3 = a2
+        CSEL    x7,  x9, x7, LO         //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q24, q28, [x5], 32
+        SUBS    x0, x2, 8               // k = kc - 8
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        # Is there at least 8 bytes for main loop?
+        B.LO    3f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+1:
+        LD1     {v0.8b},  [x3], 8
+        LDR     d5, [x5], 8
+        LD1     {v1.8b}, [x15], 8
+        LD1     {v2.8b}, [x13], 8
+        LD1     {v3.8b},  [x4], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    1b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? 1 to 7 bytes of A
+        CBNZ    x0, 3f
+
+2:
+        # Apply params - preshift, scale, postshift, bias and clamp
+        LD1R    {v4.4s}, [x11], 4
+        SQSHL   v24.4s, v24.4s, v4.4s   // shift to upper bits
+        SQSHL   v25.4s, v25.4s, v4.4s
+        SQSHL   v26.4s, v26.4s, v4.4s
+        SQSHL   v27.4s, v27.4s, v4.4s
+        LD1R    {v5.4s}, [x11], 4
+        SQSHL   v28.4s, v28.4s, v4.4s
+        SQSHL   v29.4s, v29.4s, v4.4s
+        SQSHL   v30.4s, v30.4s, v4.4s
+        SQSHL   v31.4s, v31.4s, v4.4s
+        LD1R    {v6.4s}, [x11], 4
+        SQDMULH v24.4s, v24.4s, v5.4s   // scale without rounding
+        SQDMULH v25.4s, v25.4s, v5.4s
+        SQDMULH v26.4s, v26.4s, v5.4s
+        SQDMULH v27.4s, v27.4s, v5.4s
+        SQDMULH v28.4s, v28.4s, v5.4s
+        SQDMULH v29.4s, v29.4s, v5.4s
+        SQDMULH v30.4s, v30.4s, v5.4s
+        SQDMULH v31.4s, v31.4s, v5.4s
+        SRSHL   v24.4s, v24.4s, v6.4s   // signed rounding shift left
+        SRSHL   v25.4s, v25.4s, v6.4s
+        SRSHL   v26.4s, v26.4s, v6.4s
+        SRSHL   v27.4s, v27.4s, v6.4s
+        SRSHL   v28.4s, v28.4s, v6.4s
+        SRSHL   v29.4s, v29.4s, v6.4s
+        SRSHL   v30.4s, v30.4s, v6.4s
+        SRSHL   v31.4s, v31.4s, v6.4s
+
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2       // add bias
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+        LD1R    {v4.8b}, [x11], 1       // clamp min value
+
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v5.8b}, [x11]          // clamp max value
+        SQXTN  v0.8b, v24.8h
+        SQXTN  v1.8b, v25.8h
+        SQXTN  v2.8b, v26.8h
+        SQXTN  v3.8b, v27.8h
+        SUB     x11, x11, 15             // rewind params pointer
+
+        SMAX    v0.8b, v0.8b, v4.8b
+        SMAX    v1.8b, v1.8b, v4.8b
+        SMAX    v2.8b, v2.8b, v4.8b
+        SMAX    v3.8b, v3.8b, v4.8b
+        SUBS    x1, x1, 8
+        SMIN    v0.8b, v0.8b, v5.8b
+        SMIN    v1.8b, v1.8b, v5.8b
+        SMIN    v2.8b, v2.8b, v5.8b
+        SMIN    v3.8b, v3.8b, v5.8b
+        B.LO    4f
+
+        # Store full 4 x 8
+        ST1     {v0.8b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        ST1     {v1.8b}, [x8], x12
+        SUB     x15, x15, x2            // a1 -= kc
+        ST1     {v2.8b}, [x9], x12
+        SUB     x13, x13, x2            // a2 -= kc
+        ST1     {v3.8b}, [x7], x12
+        SUB     x4,  x4, x2             // a3 -= kc
+        B.NE    0b
+        RET
+
+        # Remainder - 1 to 7 bytes of A
+        .p2align 3
+3:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b},  [x3], x0
+        LDR     d5, [x5], 8
+        LD1     {v1.8b}, [x15], x0
+        LD1     {v2.8b}, [x13], x0
+        LD1     {v3.8b},  [x4], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    2b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    2b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    2b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    2b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    2b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    2b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       2b
+
+        # Store odd width
+        .p2align 3
+4:
+        TBZ     x1, 2, 5f
+        STR     s0, [x6], 4
+        STR     s1, [x8], 4
+        DUP     s0, v0.s[1]
+        DUP     s1, v1.s[1]
+        STR     s2, [x9], 4
+        STR     s3, [x7], 4
+        DUP     s2, v2.s[1]
+        DUP     s3, v3.s[1]
+5:
+        TBZ     x1, 1, 6f
+        STR     h0, [x6], 2
+        STR     h1, [x8], 2
+        DUP     h0, v0.h[1]
+        DUP     h1, v1.h[1]
+        STR     h2, [x9], 2
+        STR     h3, [x7], 2
+        DUP     h2, v2.h[1]
+        DUP     h3, v3.h[1]
+6:
+        TBZ     x1, 0, 7f
+        STR     b0, [x6]
+        STR     b1, [x8]
+        STR     b2, [x9]
+        STR     b3, [x7]
+7:
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
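
The epilogue of the AArch64 kernel above ("Apply params - preshift, scale, postshift, bias and clamp") is the rndnu requantization that turns each int32 accumulator into an int8 output. A rough scalar model of the SQSHL / SQDMULH / SRSHL / SQADD / clamp sequence, with the intermediate saturations (SQSHL and the two SQXTN narrows) omitted for brevity and with illustrative parameter names (a sketch, not the XNNPACK reference code):

#include <stdint.h>

static int8_t requantize_rndnu(int32_t acc, int32_t pre_shift, int32_t multiplier,
                               int32_t post_shift, int16_t zero_point,
                               int8_t out_min, int8_t out_max) {
  // SQSHL: shift the accumulator toward the upper bits (saturation omitted here).
  int64_t v = (int64_t) acc << pre_shift;
  // SQDMULH: doubling multiply that keeps the high 32 bits.
  v = (2 * (int64_t) (int32_t) v * multiplier) >> 32;
  // SRSHL by a negative amount acts as a rounding right shift (assumes post_shift >= 1).
  v = (v + ((int64_t) 1 << (post_shift - 1))) >> post_shift;
  // SQADD with the output zero point, then clamp (SMAX / SMIN).
  v += zero_point;
  if (v < out_min) v = out_min;
  if (v > out_max) v = out_max;
  return (int8_t) v;
}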
diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..c1bc2f8
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,397 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3  x4 v3
+# B   x5 v5
+# C0  x6 v24 v28
+# C1  x8 v25 v29
+# C2  x9 v26 v30
+# C3  x7 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64
+
+        # Clamp A and C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x12, x11, [sp]          // Load cn_stride, params
+        ADD     x15, x3, x4             // a1 = a0 + a_stride
+        ADD     x8, x6, x7              // c1 = c0 + cm_stride
+        CSEL    x15, x3, x15, LO        //   a1 = a0
+        CSEL    x8, x6,  x8, LO         //   c1 = c0
+
+        ADD     x13, x15, x4            // a2 = a1 + a_stride
+        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
+                                        // if mr <= 2
+        CSEL    x13, x15, x13, LS       //   a2 = a1
+        CSEL    x9,  x8,  x9, LS        //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x4, x13, x4             // a3 = a2 + a_stride
+        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
+        CSEL    x4, x13, x4, LO         //   a3 = a2
+        CSEL    x7,  x9, x7, LO         //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q24, q28, [x5], 32
+        SUBS    x0, x2, 8               // k = kc - 8
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        # Is there at least 8 bytes for main loop?
+        B.LO    3f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+1:
+        LD1     {v0.8b},  [x3], 8
+        LDR     d5, [x5], 8
+        LD1     {v1.8b}, [x15], 8
+        LD1     {v2.8b}, [x13], 8
+        LD1     {v3.8b},  [x4], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        PRFM    PLDL1KEEP, [x13, 128]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        PRFM    PLDL1KEEP, [x15, 128]
+        PRFM    PLDL1KEEP, [x3, 128]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        PRFM    PLDL1KEEP, [x4, 128]
+        PRFM    PLDL1KEEP, [x5, 448]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        PRFM    PLDL1KEEP, [x5, 512]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    1b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? 1 to 7 bytes of A
+        CBNZ    x0, 3f
+
+2:
+        # Apply params - preshift, scale, postshift, bias and clamp
+        LD1R    {v4.4s}, [x11], 4
+        SQSHL   v24.4s, v24.4s, v4.4s   // shift to upper bits
+        SQSHL   v25.4s, v25.4s, v4.4s
+        SQSHL   v26.4s, v26.4s, v4.4s
+        SQSHL   v27.4s, v27.4s, v4.4s
+        LD1R    {v5.4s}, [x11], 4
+        SQSHL   v28.4s, v28.4s, v4.4s
+        SQSHL   v29.4s, v29.4s, v4.4s
+        SQSHL   v30.4s, v30.4s, v4.4s
+        SQSHL   v31.4s, v31.4s, v4.4s
+        LD1R    {v6.4s}, [x11], 4
+        SQDMULH v24.4s, v24.4s, v5.4s   // scale without rounding
+        SQDMULH v25.4s, v25.4s, v5.4s
+        SQDMULH v26.4s, v26.4s, v5.4s
+        SQDMULH v27.4s, v27.4s, v5.4s
+        SQDMULH v28.4s, v28.4s, v5.4s
+        SQDMULH v29.4s, v29.4s, v5.4s
+        SQDMULH v30.4s, v30.4s, v5.4s
+        SQDMULH v31.4s, v31.4s, v5.4s
+        SRSHL   v24.4s, v24.4s, v6.4s   // signed rounding shift left
+        SRSHL   v25.4s, v25.4s, v6.4s
+        SRSHL   v26.4s, v26.4s, v6.4s
+        SRSHL   v27.4s, v27.4s, v6.4s
+        SRSHL   v28.4s, v28.4s, v6.4s
+        SRSHL   v29.4s, v29.4s, v6.4s
+        SRSHL   v30.4s, v30.4s, v6.4s
+        SRSHL   v31.4s, v31.4s, v6.4s
+
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2       // add bias
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+        LD1R    {v4.8b}, [x11], 1       // clamp min value
+
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v5.8b}, [x11]          // clamp max value
+        SQXTN  v0.8b, v24.8h
+        SQXTN  v1.8b, v25.8h
+        SQXTN  v2.8b, v26.8h
+        SQXTN  v3.8b, v27.8h
+        SUB     x11, x11, 15             // rewind params pointer
+
+        SMAX    v0.8b, v0.8b, v4.8b
+        SMAX    v1.8b, v1.8b, v4.8b
+        SMAX    v2.8b, v2.8b, v4.8b
+        SMAX    v3.8b, v3.8b, v4.8b
+        SUBS    x1, x1, 8
+        SMIN    v0.8b, v0.8b, v5.8b
+        SMIN    v1.8b, v1.8b, v5.8b
+        SMIN    v2.8b, v2.8b, v5.8b
+        SMIN    v3.8b, v3.8b, v5.8b
+        B.LO    4f
+
+        # Store full 4 x 8
+        ST1     {v0.8b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        ST1     {v1.8b}, [x8], x12
+        SUB     x15, x15, x2            // a1 -= kc
+        ST1     {v2.8b}, [x9], x12
+        SUB     x13, x13, x2            // a2 -= kc
+        ST1     {v3.8b}, [x7], x12
+        SUB     x4,  x4, x2             // a3 -= kc
+        B.NE    0b
+        RET
+
+        # Remainder - 1 to 7 bytes of A
+        .p2align 3
+3:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b},  [x3], x0
+        LDR     d5, [x5], 8
+        LD1     {v1.8b}, [x15], x0
+        LD1     {v2.8b}, [x13], x0
+        LD1     {v3.8b},  [x4], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    2b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    2b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    2b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    2b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    2b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    2b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       2b
+
+        # Store odd width
+        .p2align 3
+4:
+        TBZ     x1, 2, 5f
+        STR     s0, [x6], 4
+        STR     s1, [x8], 4
+        DUP     s0, v0.s[1]
+        DUP     s1, v1.s[1]
+        STR     s2, [x9], 4
+        STR     s3, [x7], 4
+        DUP     s2, v2.s[1]
+        DUP     s3, v3.s[1]
+5:
+        TBZ     x1, 1, 6f
+        STR     h0, [x6], 2
+        STR     h1, [x8], 2
+        DUP     h0, v0.h[1]
+        DUP     h1, v1.h[1]
+        STR     h2, [x9], 2
+        STR     h3, [x7], 2
+        DUP     h2, v2.h[1]
+        DUP     h3, v3.h[1]
+6:
+        TBZ     x1, 0, 7f
+        STR     b0, [x6]
+        STR     b1, [x8]
+        STR     b2, [x9]
+        STR     b3, [x7]
+7:
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
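
The prfm variant above is identical to the plain ld64 kernel except for the PRFM PLDL1KEEP hints issued in the first block of the main loop, which pull the next slices of the four A rows (+128 bytes) and the upcoming packed weights (+448 and +512 bytes) toward L1 ahead of use. Expressed from C, the same intent would typically use the GCC/Clang prefetch builtin; a hedged sketch with illustrative pointer names:

#include <stdint.h>

// Equivalent software prefetch hints (sketch only; distances copied from the PRFM offsets).
static inline void prefetch_next_tiles(const int8_t* a0, const int8_t* a1,
                                       const int8_t* a2, const int8_t* a3,
                                       const int8_t* w) {
  __builtin_prefetch(a0 + 128);   // PRFM PLDL1KEEP, [x3, 128]
  __builtin_prefetch(a1 + 128);   // PRFM PLDL1KEEP, [x15, 128]
  __builtin_prefetch(a2 + 128);   // PRFM PLDL1KEEP, [x13, 128]
  __builtin_prefetch(a3 + 128);   // PRFM PLDL1KEEP, [x4, 128]
  __builtin_prefetch(w + 448);    // PRFM PLDL1KEEP, [x5, 448]
  __builtin_prefetch(w + 512);    // PRFM PLDL1KEEP, [x5, 512]
}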
diff --git a/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in b/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
index cb7b592..d3e8d6e 100644
--- a/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+++ b/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
@@ -55,7 +55,7 @@
 # A1  x14  v1
 # A2  x15  v2
 # A3  x20  v3
-# B    x5  v4  v5  v6
+# B    x5  v4  v5
 # C0   x6 v16 v20 v24 v28
 # C1  x16 v17 v21 v25 v29
 # C2  x17 v18 v22 v26 v30
@@ -65,7 +65,6 @@
   # unused  v8 v9 v10 v11 v12 v13 v14 v15
 $else:
   # unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
 
 BEGIN_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
 
@@ -81,7 +80,7 @@
         CSEL    x17, x16, x17, LS       //   c2 = c1
 
         CMP     x0, 4                   // if mr < 4
-        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        STR     x20, [sp, -16]!         // Save x20 on stack
         ADD     x7,  x17, x7            // c3 = c2 + cm_stride
         CSEL    x7,  x17, x7, LO        //   c3 = c2
 
@@ -571,8 +570,8 @@
         # nc loop
         B.HI    0b
 
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
         # Remainder- 1 to 7 bytes of A
@@ -819,8 +818,8 @@
         STR     b1, [x16]
         STR     b0, [x6]
 9:
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
 END_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
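
The new 4x8 IGEMM template added below resolves its indirect A pointers the same way as the 4x16 template patched above: each of the four entries read from the indirection buffer is either redirected to the shared zero buffer or advanced by a_offset before the inner loop runs (the CMP/ADD/CSEL triplets). A minimal C sketch of that selection, with illustrative names rather than the XNNPACK API:

#include <stddef.h>
#include <stdint.h>

// One indirection entry: use the zero buffer as-is, otherwise offset into the real input.
static inline const int8_t* resolve_a_pointer(const int8_t* a, const int8_t* zero,
                                              size_t a_offset) {
  return (a == zero) ? zero : a + a_offset;
}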
diff --git a/src/qs8-igemm/4x8-aarch64-neon-mlal-lane-ld64.S.in b/src/qs8-igemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
new file mode 100644
index 0000000..f7f91b2
--- /dev/null
+++ b/src/qs8-igemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
@@ -0,0 +1,559 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert REQUANTIZATION in ["FP32", "RNDNU"]
+$assert not CHANNELWISE or REQUANTIZATION == "FP32"
+$assert DATATYPE in ["QC8", "QS8", "QU8"]
+$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
+$assert DATATYPE != "QU8" or REQUANTIZATION == "RNDNU"
+
+#include <xnnpack/assembly.h>
+
+$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
+$if DATATYPE == "QU8":
+  $REWIND_DECREMENT = 19
+$else:
+  $REWIND_DECREMENT = 3 if CHANNELWISE else {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
+$XMIN = "UMIN" if DATATYPE == "QU8" else "SMIN"
+$XMAX = "UMAX" if DATATYPE == "QU8" else "SMAX"
+$XXTL = "UXTL" if DATATYPE == "QU8" else "SXTL"
+$SQXTXN = "SQXTUN" if DATATYPE == "QU8" else "SQXTN"
+$SQXTXN2 = "SQXTUN2" if DATATYPE == "QU8" else "SQXTN2"
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
+# void xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const ${XINT8_T}** restrict a, x4
+#     const ${XINT8_T}* restrict w,  x5
+#     ${XINT8_T}* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x8
+#     const ${XINT8_T}* zero,                [sp + 16] -> x12
+#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
+
+$if REQUANTIZATION == "RNDNU" and DATATYPE == "QU8":
+  # params structure is 20 bytes
+  #  struct {
+  #    ${XINT8_T} kernel_zero_point[4];
+  #    int32_t right_pre_shift;
+  #    int32_t multiplier;
+  #    int32_t right_post_shift;
+  #    int16_t output_zero_point;
+  #    ${XINT8_T} output_min;
+  #    ${XINT8_T} output_max;
+  #  } rndnu_neon;
+  #
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x13  v0
+# A1  x14  v1
+# A2  x15  v2
+# A3  x20  v3
+# B    x5  v5
+# C0   x6 v24 v28
+# C1  x16 v25 v29
+# C2  x17 v26 v30
+# C3   x7 v27 v31
+$if DATATYPE == "QU8":
+  # zero_point v7
+  # unused  v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+$else:
+  # unused  v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+
+BEGIN_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+        # Clamp C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
+        ADD     x16, x6, x7             // c1 = c0 + cm_stride
+        CSEL    x16, x6,  x16, LO       //   c1 = c0
+
+        ADD     x17, x16, x7            // c2 = c1 + cm_stride
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
+                                        // if mr <= 2
+        CSEL    x17, x16, x17, LS       //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        STR     x20, [sp, -16]!         // Save x20 on stack
+        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        CSEL    x7,  x17, x7, LO        //   c3 = c2
+
+        $if DATATYPE == "QU8":
+          LD1R    {v7.4s}, [x11]          // kernel_zero_point
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q24, q28, [x5], 32
+        $if DATATYPE == "QU8":
+          ADD     x11, x11, 4              // adjust params pointer
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP     x13, x14, [x4], 16
+        LDP     x15, x20, [x4], 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x8            // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else += a0 + a_offset
+        CMP     x14, x12                // if a1 == zero
+        ADD     x14, x14, x8            // a1 += a_offset
+        CSEL    x14, x12, x14, EQ       //   a1 = zero, else += a1 + a_offset
+        CMP     x15, x12                // if a2 == zero
+        ADD     x15, x15, x8            // a2 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a2 = zero, else += a2 + a_offset
+        CMP     x20, x12                // if a3 == zero
+        ADD     x20, x20, x8            // a3 += a_offset
+        CSEL    x20, x12, x20, EQ       //   a3 = zero, else += a3 + a_offset
+
+        # Is there at least 8 bytes for main loop?
+        SUBS    x0, x2, 8               // k = kc - 8
+        B.LO    4f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+2:
+        LD1     {v0.8b}, [x13], 8
+        LDR     d5, [x5], 8
+        LD1     {v1.8b}, [x14], 8
+        LD1     {v2.8b}, [x15], 8
+        LD1     {v3.8b}, [x20], 8
+        ${XXTL}    v0.8h, v0.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        ${XXTL}    v1.8h, v1.8b
+        ${XXTL}    v2.8h, v2.8b
+        ${XXTL}    v3.8h, v3.8b
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x13, 128]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x14, 128]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x15, 128]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x20, 128]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x5, 448]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x5, 512]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    2b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder?- 1 to 7 bytes of A
+        CBNZ    x0, 4f
+
+3:
+        # ks loop
+        SUBS    x9, x9, 32              // ks -= MR * sizeof(${XINT8_T}*)
+        B.HI    1b
+
+        $if REQUANTIZATION == "RNDNU":
+          # Apply params - preshift, scale, postshift, bias and clamp
+          LD1R    {v4.4s}, [x11], 4
+          SQSHL   v24.4s, v24.4s, v4.4s   // shift to upper bits
+          SQSHL   v25.4s, v25.4s, v4.4s
+          SQSHL   v26.4s, v26.4s, v4.4s
+          SQSHL   v27.4s, v27.4s, v4.4s
+          LD1R    {v5.4s}, [x11], 4
+          SQSHL   v28.4s, v28.4s, v4.4s
+          SQSHL   v29.4s, v29.4s, v4.4s
+          SQSHL   v30.4s, v30.4s, v4.4s
+          SQSHL   v31.4s, v31.4s, v4.4s
+          LD1R    {v6.4s}, [x11], 4
+          SQDMULH v24.4s, v24.4s, v5.4s   // scale without rounding
+          SQDMULH v25.4s, v25.4s, v5.4s
+          SQDMULH v26.4s, v26.4s, v5.4s
+          SQDMULH v27.4s, v27.4s, v5.4s
+          SQDMULH v28.4s, v28.4s, v5.4s
+          SQDMULH v29.4s, v29.4s, v5.4s
+          SQDMULH v30.4s, v30.4s, v5.4s
+          SQDMULH v31.4s, v31.4s, v5.4s
+          SRSHL   v24.4s, v24.4s, v6.4s   // signed rounding shift left
+          SRSHL   v25.4s, v25.4s, v6.4s
+          SRSHL   v26.4s, v26.4s, v6.4s
+          SRSHL   v27.4s, v27.4s, v6.4s
+          SRSHL   v28.4s, v28.4s, v6.4s
+          SRSHL   v29.4s, v29.4s, v6.4s
+          SRSHL   v30.4s, v30.4s, v6.4s
+          SRSHL   v31.4s, v31.4s, v6.4s
+        $elif REQUANTIZATION == "FP32":
+          SCVTF   v24.4s, v24.4s
+          SCVTF   v25.4s, v25.4s
+          $if not CHANNELWISE:
+            # Apply params - scale, bias and clamp
+            LD1R    {v4.4s}, [x11], 4
+            SCVTF   v26.4s, v26.4s
+            SCVTF   v27.4s, v27.4s
+          $else:
+            # Load per channel scale values from weights
+            LDR     q4, [x5], 16
+            SCVTF   v26.4s, v26.4s
+            SCVTF   v27.4s, v27.4s
+            LDR     q5, [x5], 16
+          SCVTF   v28.4s, v28.4s
+          SCVTF   v29.4s, v29.4s
+          SCVTF   v30.4s, v30.4s
+          SCVTF   v31.4s, v31.4s
+
+          $if CHANNELWISE:
+            FMUL    v24.4s, v24.4s, v4.4s
+            FMUL    v25.4s, v25.4s, v4.4s
+            FMUL    v26.4s, v26.4s, v4.4s
+            FMUL    v27.4s, v27.4s, v4.4s
+            FMUL    v28.4s, v28.4s, v5.4s
+            FMUL    v29.4s, v29.4s, v5.4s
+            FMUL    v30.4s, v30.4s, v5.4s
+            FMUL    v31.4s, v31.4s, v5.4s
+          $else:
+            FMUL    v24.4s, v24.4s, v4.4s
+            FMUL    v25.4s, v25.4s, v4.4s
+            FMUL    v26.4s, v26.4s, v4.4s
+            FMUL    v27.4s, v27.4s, v4.4s
+            FMUL    v28.4s, v28.4s, v4.4s
+            FMUL    v29.4s, v29.4s, v4.4s
+            FMUL    v30.4s, v30.4s, v4.4s
+            FMUL    v31.4s, v31.4s, v4.4s
+
+          FCVTNS  v24.4s, v24.4s
+          FCVTNS  v25.4s, v25.4s
+          FCVTNS  v26.4s, v26.4s
+          FCVTNS  v27.4s, v27.4s
+          FCVTNS  v28.4s, v28.4s
+          FCVTNS  v29.4s, v29.4s
+          FCVTNS  v30.4s, v30.4s
+          FCVTNS  v31.4s, v31.4s
+
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2       // add bias
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+        LD1R    {v4.8b}, [x11], 1       // clamp min value
+
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v5.8b}, [x11]          // clamp max value
+        ${SQXTXN}  v0.8b, v24.8h
+        ${SQXTXN}  v1.8b, v25.8h
+        ${SQXTXN}  v2.8b, v26.8h
+        ${SQXTXN}  v3.8b, v27.8h
+        SUB     x11, x11, ${REWIND_DECREMENT}             // rewind params pointer
+
+        ${XMAX}    v0.8b, v0.8b, v4.8b
+        ${XMAX}    v1.8b, v1.8b, v4.8b
+        ${XMAX}    v2.8b, v2.8b, v4.8b
+        ${XMAX}    v3.8b, v3.8b, v4.8b
+        SUBS    x1, x1, 8
+        ${XMIN}    v0.8b, v0.8b, v5.8b
+        ${XMIN}    v1.8b, v1.8b, v5.8b
+        ${XMIN}    v2.8b, v2.8b, v5.8b
+        ${XMIN}    v3.8b, v3.8b, v5.8b
+        B.LO    5f
+
+        # Store full 4 x 8
+        ST1     {v3.8b},  [x7], x10
+        ST1     {v2.8b}, [x17], x10
+        ST1     {v1.8b}, [x16], x10
+        ST1     {v0.8b},  [x6], x10
+
+        SUB     x4, x4, x3              // a -= ks
+
+        # nc loop
+        B.HI    0b
+
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
+        RET
+
+        # Remainder- 1 to 7 bytes of A
+        .p2align 3
+4:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b}, [x13], x0
+        LDR     d5, [x5], 8
+        LD1     {v1.8b}, [x14], x0
+        LD1     {v2.8b}, [x15], x0
+        LD1     {v3.8b}, [x20], x0
+        ${XXTL}    v0.8h, v0.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        ${XXTL}    v1.8h, v1.8b
+        ${XXTL}    v2.8h, v2.8b
+        ${XXTL}    v3.8h, v3.8b
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    3b
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    3b
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    3b
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    3b
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    3b
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    3b
+
+        LDR     d5, [x5], 8
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       3b
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ     x1, 2, 6f
+        STR     s3, [x7], 4
+        STR     s2, [x17], 4
+        DUP     s3, v3.s[1]
+        DUP     s2, v2.s[1]
+        STR     s1, [x16], 4
+        STR     s0, [x6], 4
+        DUP     s1, v1.s[1]
+        DUP     s0, v0.s[1]
+6:
+        TBZ     x1, 1, 7f
+        STR     h3, [x7], 2
+        STR     h2, [x17], 2
+        DUP     h3, v3.h[1]
+        DUP     h2, v2.h[1]
+        STR     h1, [x16], 2
+        STR     h0, [x6], 2
+        DUP     h1, v1.h[1]
+        DUP     h0, v0.h[1]
+7:
+        TBZ     x1, 0, 8f
+        STR     b3, [x7]
+        STR     b2, [x17]
+        STR     b1, [x16]
+        STR     b0, [x6]
+8:
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
+        RET
+
+END_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
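
For orientation, the 8-byte main loop in the template above is plain 4x8 int32 accumulation: each LDR d5 pulls one 8-wide row of packed weights, widens it (for QU8 the kernel zero point is subtracted first via USUBL), and multiplies it against one lane of the four widened A vectors. A scalar sketch of the arithmetic follows; the array layout and names are illustrative, not XNNPACK code.

#include <stddef.h>
#include <stdint.h>

// Scalar model of one pass of the 8-byte main loop: for each of the 8 lanes
// of A (v0..v3 after SXTL), one 8-wide row of B (one LDR d5 + SXTL) is
// multiplied in and accumulated into the 4x8 int32 tile (v24/v28 .. v27/v31).
static void mlal_lane_8x8(
    int32_t acc[4][8],        // accumulators: row m lives in one v24..v27 / v28..v31 pair
    const int8_t a[4][8],     // 8 bytes from each of a0..a3
    const int8_t b[8][8])     // 8 rows of 8 packed weights, in the order the loop reads them
{
  for (size_t k = 0; k < 8; k++) {      // lane index: v0.h[k] .. v3.h[k]
    for (size_t m = 0; m < 4; m++) {    // one SMLAL/SMLAL2 pair per row
      for (size_t n = 0; n < 8; n++) {
        acc[m][n] += (int32_t) a[m][k] * (int32_t) b[k][n];
      }
    }
  }
}
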
diff --git a/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
index 42794c2..dfcdb5e 100644
--- a/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -31,13 +31,12 @@
 # A1  x14  v1
 # A2  x15  v2
 # A3  x20  v3
-# B    x5  v4  v5  v6
+# B    x5  v4  v5
 # C0   x6 v16 v20 v24 v28
 # C1  x16 v17 v21 v25 v29
 # C2  x17 v18 v22 v26 v30
 # C3   x7 v19 v23 v27 v31
 # unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
 
 BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
 
@@ -53,7 +52,7 @@
         CSEL    x17, x16, x17, LS       //   c2 = c1
 
         CMP     x0, 4                   // if mr < 4
-        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        STR     x20, [sp, -16]!         // Save x20 on stack
         ADD     x7,  x17, x7            // c3 = c2 + cm_stride
         CSEL    x7,  x17, x7, LO        //   c3 = c2
 
@@ -398,8 +397,8 @@
         # nc loop
         B.HI    0b
 
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
         # Remainder- 1 to 7 bytes of A
@@ -604,8 +603,8 @@
         STR     b1, [x16]
         STR     b0, [x6]
 9:
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
 END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
diff --git a/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
index b9cda9e..6c9bfb9 100644
--- a/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -31,13 +31,12 @@
 # A1  x14  v1
 # A2  x15  v2
 # A3  x20  v3
-# B    x5  v4  v5  v6
+# B    x5  v4  v5
 # C0   x6 v16 v20 v24 v28
 # C1  x16 v17 v21 v25 v29
 # C2  x17 v18 v22 v26 v30
 # C3   x7 v19 v23 v27 v31
 # unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
 
 BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
 
@@ -53,7 +52,7 @@
         CSEL    x17, x16, x17, LS       //   c2 = c1
 
         CMP     x0, 4                   // if mr < 4
-        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        STR     x20, [sp, -16]!         // Save x20 on stack
         ADD     x7,  x17, x7            // c3 = c2 + cm_stride
         CSEL    x7,  x17, x7, LO        //   c3 = c2
 
@@ -404,8 +403,8 @@
         # nc loop
         B.HI    0b
 
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
         # Remainder- 1 to 7 bytes of A
@@ -610,8 +609,8 @@
         STR     b1, [x16]
         STR     b0, [x6]
 9:
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
 END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
diff --git a/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
index 2502537..eeee346 100644
--- a/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -31,13 +31,12 @@
 # A1  x14  v1
 # A2  x15  v2
 # A3  x20  v3
-# B    x5  v4  v5  v6
+# B    x5  v4  v5
 # C0   x6 v16 v20 v24 v28
 # C1  x16 v17 v21 v25 v29
 # C2  x17 v18 v22 v26 v30
 # C3   x7 v19 v23 v27 v31
 # unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
 
 BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
 
@@ -53,7 +52,7 @@
         CSEL    x17, x16, x17, LS       //   c2 = c1
 
         CMP     x0, 4                   // if mr < 4
-        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        STR     x20, [sp, -16]!         // Save x20 on stack
         ADD     x7,  x17, x7            // c3 = c2 + cm_stride
         CSEL    x7,  x17, x7, LO        //   c3 = c2
 
@@ -398,8 +397,8 @@
         # nc loop
         B.HI    0b
 
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
         # Remainder- 1 to 7 bytes of A
@@ -604,8 +603,8 @@
         STR     b1, [x16]
         STR     b0, [x6]
 9:
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
 END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
diff --git a/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
index 70f0419..fc73819 100644
--- a/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -31,13 +31,12 @@
 # A1  x14  v1
 # A2  x15  v2
 # A3  x20  v3
-# B    x5  v4  v5  v6
+# B    x5  v4  v5
 # C0   x6 v16 v20 v24 v28
 # C1  x16 v17 v21 v25 v29
 # C2  x17 v18 v22 v26 v30
 # C3   x7 v19 v23 v27 v31
 # unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
 
 BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
 
@@ -53,7 +52,7 @@
         CSEL    x17, x16, x17, LS       //   c2 = c1
 
         CMP     x0, 4                   // if mr < 4
-        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        STR     x20, [sp, -16]!         // Save x20 on stack
         ADD     x7,  x17, x7            // c3 = c2 + cm_stride
         CSEL    x7,  x17, x7, LO        //   c3 = c2
 
@@ -404,8 +403,8 @@
         # nc loop
         B.HI    0b
 
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
         # Remainder- 1 to 7 bytes of A
@@ -610,8 +609,8 @@
         STR     b1, [x16]
         STR     b0, [x6]
 9:
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
 END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
diff --git a/src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..ed98da2
--- /dev/null
+++ b/src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,420 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t** restrict a, x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x8
+#     const int8_t* zero,                [sp + 16] -> x12
+#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x13  v0
+# A1  x14  v1
+# A2  x15  v2
+# A3  x20  v3
+# B    x5  v5
+# C0   x6 v24 v28
+# C1  x16 v25 v29
+# C2  x17 v26 v30
+# C3   x7 v27 v31
+# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64
+
+        # Clamp C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
+        ADD     x16, x6, x7             // c1 = c0 + cm_stride
+        CSEL    x16, x6,  x16, LO       //   c1 = c0
+
+        ADD     x17, x16, x7            // c2 = c1 + cm_stride
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
+                                        // if mr <= 2
+        CSEL    x17, x16, x17, LS       //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        STR     x20, [sp, -16]!         // Save x20 on stack
+        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        CSEL    x7,  x17, x7, LO        //   c3 = c2
+
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q24, q28, [x5], 32
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP     x13, x14, [x4], 16
+        LDP     x15, x20, [x4], 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x8            // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else += a0 + a_offset
+        CMP     x14, x12                // if a1 == zero
+        ADD     x14, x14, x8            // a1 += a_offset
+        CSEL    x14, x12, x14, EQ       //   a1 = zero, else += a1 + a_offset
+        CMP     x15, x12                // if a2 == zero
+        ADD     x15, x15, x8            // a2 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a2 = zero, else += a2 + a_offset
+        CMP     x20, x12                // if a3 == zero
+        ADD     x20, x20, x8            // a3 += a_offset
+        CSEL    x20, x12, x20, EQ       //   a3 = zero, else += a3 + a_offset
+
+        # Is there at least 8 bytes for main loop?
+        SUBS    x0, x2, 8               // k = kc - 8
+        B.LO    4f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+2:
+        LD1     {v0.8b}, [x13], 8
+        LDR     d5, [x5], 8
+        LD1     {v1.8b}, [x14], 8
+        LD1     {v2.8b}, [x15], 8
+        LD1     {v3.8b}, [x20], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    2b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder?- 1 to 7 bytes of A
+        CBNZ    x0, 4f
+
+3:
+        # ks loop
+        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+        # Apply params - preshift, scale, postshift, bias and clamp
+        LD1R    {v4.4s}, [x11], 4
+        SQSHL   v24.4s, v24.4s, v4.4s   // shift to upper bits
+        SQSHL   v25.4s, v25.4s, v4.4s
+        SQSHL   v26.4s, v26.4s, v4.4s
+        SQSHL   v27.4s, v27.4s, v4.4s
+        LD1R    {v5.4s}, [x11], 4
+        SQSHL   v28.4s, v28.4s, v4.4s
+        SQSHL   v29.4s, v29.4s, v4.4s
+        SQSHL   v30.4s, v30.4s, v4.4s
+        SQSHL   v31.4s, v31.4s, v4.4s
+        LD1R    {v6.4s}, [x11], 4
+        SQDMULH v24.4s, v24.4s, v5.4s   // scale without rounding
+        SQDMULH v25.4s, v25.4s, v5.4s
+        SQDMULH v26.4s, v26.4s, v5.4s
+        SQDMULH v27.4s, v27.4s, v5.4s
+        SQDMULH v28.4s, v28.4s, v5.4s
+        SQDMULH v29.4s, v29.4s, v5.4s
+        SQDMULH v30.4s, v30.4s, v5.4s
+        SQDMULH v31.4s, v31.4s, v5.4s
+        SRSHL   v24.4s, v24.4s, v6.4s   // signed rounding shift left
+        SRSHL   v25.4s, v25.4s, v6.4s
+        SRSHL   v26.4s, v26.4s, v6.4s
+        SRSHL   v27.4s, v27.4s, v6.4s
+        SRSHL   v28.4s, v28.4s, v6.4s
+        SRSHL   v29.4s, v29.4s, v6.4s
+        SRSHL   v30.4s, v30.4s, v6.4s
+        SRSHL   v31.4s, v31.4s, v6.4s
+
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2       // add bias
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+        LD1R    {v4.8b}, [x11], 1       // clamp min value
+
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v5.8b}, [x11]          // clamp max value
+        SQXTN  v0.8b, v24.8h
+        SQXTN  v1.8b, v25.8h
+        SQXTN  v2.8b, v26.8h
+        SQXTN  v3.8b, v27.8h
+        SUB     x11, x11, 15             // rewind params pointer
+
+        SMAX    v0.8b, v0.8b, v4.8b
+        SMAX    v1.8b, v1.8b, v4.8b
+        SMAX    v2.8b, v2.8b, v4.8b
+        SMAX    v3.8b, v3.8b, v4.8b
+        SUBS    x1, x1, 8
+        SMIN    v0.8b, v0.8b, v5.8b
+        SMIN    v1.8b, v1.8b, v5.8b
+        SMIN    v2.8b, v2.8b, v5.8b
+        SMIN    v3.8b, v3.8b, v5.8b
+        B.LO    5f
+
+        # Store full 4 x 8
+        ST1     {v3.8b},  [x7], x10
+        ST1     {v2.8b}, [x17], x10
+        ST1     {v1.8b}, [x16], x10
+        ST1     {v0.8b},  [x6], x10
+
+        SUB     x4, x4, x3              // a -= ks
+
+        # nc loop
+        B.HI    0b
+
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
+        RET
+
+        # Remainder- 1 to 7 bytes of A
+        .p2align 3
+4:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b}, [x13], x0
+        LDR     d5, [x5], 8
+        LD1     {v1.8b}, [x14], x0
+        LD1     {v2.8b}, [x15], x0
+        LD1     {v3.8b}, [x20], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    3b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    3b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    3b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    3b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    3b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    3b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       3b
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ     x1, 2, 6f
+        STR     s3, [x7], 4
+        STR     s2, [x17], 4
+        DUP     s3, v3.s[1]
+        DUP     s2, v2.s[1]
+        STR     s1, [x16], 4
+        STR     s0, [x6], 4
+        DUP     s1, v1.s[1]
+        DUP     s0, v0.s[1]
+6:
+        TBZ     x1, 1, 7f
+        STR     h3, [x7], 2
+        STR     h2, [x17], 2
+        DUP     h3, v3.h[1]
+        DUP     h2, v2.h[1]
+        STR     h1, [x16], 2
+        STR     h0, [x6], 2
+        DUP     h1, v1.h[1]
+        DUP     h0, v0.h[1]
+7:
+        TBZ     x1, 0, 8f
+        STR     b3, [x7]
+        STR     b2, [x17]
+        STR     b1, [x16]
+        STR     b0, [x6]
+8:
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
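
The block after label 3 in this kernel turns each int32 accumulator into an int8 output using the rndnu scheme: saturating pre-shift (SQSHL), doubling multiply-high (SQDMULH), rounding post-shift (SRSHL), narrowing, zero-point add and clamp. Below is a rough scalar model, not XNNPACK's reference code: the helper names are mine, shift magnitudes are assumed to stay under 32, arithmetic right shift of negative values is assumed, and the 16-bit saturation of SQADD is folded into the final clamp (which yields the same result for int8 min/max bounds).

#include <stdint.h>

// SQSHL (register): positive shift saturates left, negative shift truncates right.
static int32_t sqshl32(int32_t x, int32_t shift) {
  if (shift < 0) return x >> -shift;          // arithmetic right shift assumed
  const int64_t y = (int64_t) x << shift;
  if (y > INT32_MAX) return INT32_MAX;
  if (y < INT32_MIN) return INT32_MIN;
  return (int32_t) y;
}

// SQDMULH: high half of the doubled product, saturating INT32_MIN * INT32_MIN.
static int32_t sqdmulh32(int32_t a, int32_t b) {
  if (a == INT32_MIN && b == INT32_MIN) return INT32_MAX;
  return (int32_t) (((int64_t) a * b) >> 31);
}

// SRSHL (register): a negative shift is a rounding arithmetic right shift.
static int32_t srshl32(int32_t x, int32_t shift) {
  if (shift >= 0) return (int32_t) ((uint32_t) x << shift);
  return (int32_t) (((int64_t) x + (INT64_C(1) << (-shift - 1))) >> -shift);
}

static int8_t requantize_rndnu(
    int32_t acc, int32_t pre_shift, int32_t multiplier, int32_t post_shift,
    int16_t output_zero_point, int8_t output_min, int8_t output_max) {
  int32_t x = srshl32(sqdmulh32(sqshl32(acc, pre_shift), multiplier), post_shift);
  if (x > INT16_MAX) x = INT16_MAX;      // SQXTN to 16 bits
  if (x < INT16_MIN) x = INT16_MIN;
  x += output_zero_point;                // SQADD (saturation covered by the clamp below)
  if (x < output_min) x = output_min;    // SMAX
  if (x > output_max) x = output_max;    // SMIN (also covers the SQXTN to 8 bits)
  return (int8_t) x;
}
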
diff --git a/src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..3f0fc8c
--- /dev/null
+++ b/src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,426 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t** restrict a, x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x8
+#     const int8_t* zero,                [sp + 16] -> x12
+#     const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x13  v0
+# A1  x14  v1
+# A2  x15  v2
+# A3  x20  v3
+# B    x5  v5
+# C0   x6 v24 v28
+# C1  x16 v25 v29
+# C2  x17 v26 v30
+# C3   x7 v27 v31
+# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64
+
+        # Clamp C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
+        ADD     x16, x6, x7             // c1 = c0 + cm_stride
+        CSEL    x16, x6,  x16, LO       //   c1 = c0
+
+        ADD     x17, x16, x7            // c2 = c1 + cm_stride
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
+                                        // if mr <= 2
+        CSEL    x17, x16, x17, LS       //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        STR     x20, [sp, -16]!         // Save x20 on stack
+        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        CSEL    x7,  x17, x7, LO        //   c3 = c2
+
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q24, q28, [x5], 32
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP     x13, x14, [x4], 16
+        LDP     x15, x20, [x4], 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x8            // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else += a0 + a_offset
+        CMP     x14, x12                // if a1 == zero
+        ADD     x14, x14, x8            // a1 += a_offset
+        CSEL    x14, x12, x14, EQ       //   a1 = zero, else += a1 + a_offset
+        CMP     x15, x12                // if a2 == zero
+        ADD     x15, x15, x8            // a2 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a2 = zero, else += a2 + a_offset
+        CMP     x20, x12                // if a3 == zero
+        ADD     x20, x20, x8            // a3 += a_offset
+        CSEL    x20, x12, x20, EQ       //   a3 = zero, else += a3 + a_offset
+
+        # Is there at least 8 bytes for main loop?
+        SUBS    x0, x2, 8               // k = kc - 8
+        B.LO    4f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+2:
+        LD1     {v0.8b}, [x13], 8
+        LDR     d5, [x5], 8
+        LD1     {v1.8b}, [x14], 8
+        LD1     {v2.8b}, [x15], 8
+        LD1     {v3.8b}, [x20], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        PRFM    PLDL1KEEP, [x13, 128]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        PRFM    PLDL1KEEP, [x14, 128]
+        PRFM    PLDL1KEEP, [x15, 128]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        PRFM    PLDL1KEEP, [x20, 128]
+        PRFM    PLDL1KEEP, [x5, 448]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        PRFM    PLDL1KEEP, [x5, 512]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    2b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder?- 1 to 7 bytes of A
+        CBNZ    x0, 4f
+
+3:
+        # ks loop
+        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+        # Apply params - preshift, scale, postshift, bias and clamp
+        LD1R    {v4.4s}, [x11], 4
+        SQSHL   v24.4s, v24.4s, v4.4s   // shift to upper bits
+        SQSHL   v25.4s, v25.4s, v4.4s
+        SQSHL   v26.4s, v26.4s, v4.4s
+        SQSHL   v27.4s, v27.4s, v4.4s
+        LD1R    {v5.4s}, [x11], 4
+        SQSHL   v28.4s, v28.4s, v4.4s
+        SQSHL   v29.4s, v29.4s, v4.4s
+        SQSHL   v30.4s, v30.4s, v4.4s
+        SQSHL   v31.4s, v31.4s, v4.4s
+        LD1R    {v6.4s}, [x11], 4
+        SQDMULH v24.4s, v24.4s, v5.4s   // scale without rounding
+        SQDMULH v25.4s, v25.4s, v5.4s
+        SQDMULH v26.4s, v26.4s, v5.4s
+        SQDMULH v27.4s, v27.4s, v5.4s
+        SQDMULH v28.4s, v28.4s, v5.4s
+        SQDMULH v29.4s, v29.4s, v5.4s
+        SQDMULH v30.4s, v30.4s, v5.4s
+        SQDMULH v31.4s, v31.4s, v5.4s
+        SRSHL   v24.4s, v24.4s, v6.4s   // signed rounding shift left
+        SRSHL   v25.4s, v25.4s, v6.4s
+        SRSHL   v26.4s, v26.4s, v6.4s
+        SRSHL   v27.4s, v27.4s, v6.4s
+        SRSHL   v28.4s, v28.4s, v6.4s
+        SRSHL   v29.4s, v29.4s, v6.4s
+        SRSHL   v30.4s, v30.4s, v6.4s
+        SRSHL   v31.4s, v31.4s, v6.4s
+
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2       // add bias
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+        LD1R    {v4.8b}, [x11], 1       // clamp min value
+
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v5.8b}, [x11]          // clamp max value
+        SQXTN  v0.8b, v24.8h
+        SQXTN  v1.8b, v25.8h
+        SQXTN  v2.8b, v26.8h
+        SQXTN  v3.8b, v27.8h
+        SUB     x11, x11, 15             // rewind params pointer
+
+        SMAX    v0.8b, v0.8b, v4.8b
+        SMAX    v1.8b, v1.8b, v4.8b
+        SMAX    v2.8b, v2.8b, v4.8b
+        SMAX    v3.8b, v3.8b, v4.8b
+        SUBS    x1, x1, 8
+        SMIN    v0.8b, v0.8b, v5.8b
+        SMIN    v1.8b, v1.8b, v5.8b
+        SMIN    v2.8b, v2.8b, v5.8b
+        SMIN    v3.8b, v3.8b, v5.8b
+        B.LO    5f
+
+        # Store full 4 x 8
+        ST1     {v3.8b},  [x7], x10
+        ST1     {v2.8b}, [x17], x10
+        ST1     {v1.8b}, [x16], x10
+        ST1     {v0.8b},  [x6], x10
+
+        SUB     x4, x4, x3              // a -= ks
+
+        # nc loop
+        B.HI    0b
+
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
+        RET
+
+        # Remainder- 1 to 7 bytes of A
+        .p2align 3
+4:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b}, [x13], x0
+        LDR     d5, [x5], 8
+        LD1     {v1.8b}, [x14], x0
+        LD1     {v2.8b}, [x15], x0
+        LD1     {v3.8b}, [x20], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    3b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    3b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    3b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    3b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    3b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    3b
+
+        LDR     d5, [x5], 8
+        SXTL    v5.8h, v5.8b
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       3b
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ     x1, 2, 6f
+        STR     s3, [x7], 4
+        STR     s2, [x17], 4
+        DUP     s3, v3.s[1]
+        DUP     s2, v2.s[1]
+        STR     s1, [x16], 4
+        STR     s0, [x6], 4
+        DUP     s1, v1.s[1]
+        DUP     s0, v0.s[1]
+6:
+        TBZ     x1, 1, 7f
+        STR     h3, [x7], 2
+        STR     h2, [x17], 2
+        DUP     h3, v3.h[1]
+        DUP     h2, v2.h[1]
+        STR     h1, [x16], 2
+        STR     h0, [x6], 2
+        DUP     h1, v1.h[1]
+        DUP     h0, v0.h[1]
+7:
+        TBZ     x1, 0, 8f
+        STR     b3, [x7]
+        STR     b2, [x17]
+        STR     b1, [x16]
+        STR     b0, [x6]
+8:
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
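
Both 4x8 igemm variants share the pointer setup at label 1: four A pointers are read from the indirection buffer, the sentinel zero pointer is passed through unchanged, and every other pointer is rebased by a_offset. A C sketch of that logic, with illustrative names:

#include <stddef.h>
#include <stdint.h>

// Scalar equivalent of the LDP/CMP/ADD/CSEL sequence at label 1.
static void setup_a_pointers(
    const int8_t* a[4],                 // a0..a3 (x13, x14, x15, x20)
    const int8_t* const* indirection,   // current position in the indirection buffer (x4)
    const int8_t* zero,                 // shared zero buffer (x12)
    size_t a_offset)                    // offset added to real row pointers (x8)
{
  for (size_t i = 0; i < 4; i++) {
    const int8_t* ai = indirection[i];
    // Pointers equal to `zero` stay on the zero buffer; all others are rebased.
    a[i] = (ai == zero) ? zero : (const int8_t*) ((uintptr_t) ai + a_offset);
  }
}
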
diff --git a/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
index 59ecaa4..9f53c93 100644
--- a/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -40,7 +40,7 @@
 # A1 x15 v1
 # A2 x13 v2
 # A3  x4 v3
-# B   x5 v4  v5  v6
+# B   x5 v4  v5
 # C0  x6 v16 v20 v24 v28
 # C1  x8 v17 v21 v25 v29
 # C2  x9 v18 v22 v26 v30
@@ -48,8 +48,6 @@
 # zero_point  v7
 # unused v8 v9 v10 v11 v12 v13 v14 v15
 
-# x10 x17 a53 temp registers
-
 BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
 
         # Clamp A and C pointers
diff --git a/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
index a4bc0d1..642f8fb 100644
--- a/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -40,7 +40,7 @@
 # A1 x15 v1
 # A2 x13 v2
 # A3  x4 v3
-# B   x5 v4  v5  v6
+# B   x5 v4  v5
 # C0  x6 v16 v20 v24 v28
 # C1  x8 v17 v21 v25 v29
 # C2  x9 v18 v22 v26 v30
@@ -48,8 +48,6 @@
 # zero_point  v7
 # unused v8 v9 v10 v11 v12 v13 v14 v15
 
-# x10 x17 a53 temp registers
-
 BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
 
         # Clamp A and C pointers
diff --git a/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
index a7c3e77..1f88d52 100644
--- a/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -42,14 +42,13 @@
 # A1  x14  v1
 # A2  x15  v2
 # A3  x20  v3
-# B    x5  v4  v5  v6
+# B    x5  v4  v5
 # C0   x6 v16 v20 v24 v28
 # C1  x16 v17 v21 v25 v29
 # C2  x17 v18 v22 v26 v30
 # C3   x7 v19 v23 v27 v31
 # zero_point v7
 # unused  v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
 
 BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
 
@@ -65,7 +64,7 @@
         CSEL    x17, x16, x17, LS       //   c2 = c1
 
         CMP     x0, 4                   // if mr < 4
-        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        STR     x20, [sp, -16]!         // Save x20 on stack
         ADD     x7,  x17, x7            // c3 = c2 + cm_stride
         CSEL    x7,  x17, x7, LO        //   c3 = c2
 
@@ -412,8 +411,8 @@
         # nc loop
         B.HI    0b
 
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
         # Remainder- 1 to 7 bytes of A
@@ -618,8 +617,8 @@
         STR     b1, [x16]
         STR     b0, [x6]
 9:
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
 END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
diff --git a/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
index ec7eff2..dc1f6fa 100644
--- a/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -42,14 +42,13 @@
 # A1  x14  v1
 # A2  x15  v2
 # A3  x20  v3
-# B    x5  v4  v5  v6
+# B    x5  v4  v5
 # C0   x6 v16 v20 v24 v28
 # C1  x16 v17 v21 v25 v29
 # C2  x17 v18 v22 v26 v30
 # C3   x7 v19 v23 v27 v31
 # zero_point v7
 # unused  v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
 
 BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
 
@@ -65,7 +64,7 @@
         CSEL    x17, x16, x17, LS       //   c2 = c1
 
         CMP     x0, 4                   // if mr < 4
-        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        STR     x20, [sp, -16]!         // Save x20 on stack
         ADD     x7,  x17, x7            // c3 = c2 + cm_stride
         CSEL    x7,  x17, x7, LO        //   c3 = c2
 
@@ -418,8 +417,8 @@
         # nc loop
         B.HI    0b
 
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
         # Remainder- 1 to 7 bytes of A
@@ -624,8 +623,8 @@
         STR     b1, [x16]
         STR     b0, [x6]
 9:
-        # Restore x20-x21 from stack
-        LDP     x20, x21, [sp], 16
+        # Restore x20 from stack
+        LDR     x20, [sp], 16
         RET
 
 END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
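
All of the igemm kernels touched here end with the same odd-width store tail (labels 5 through 8 in the new 4x8 kernels): the low nc bytes of each clamped row are written 4, 2 and 1 bytes at a time, with the surviving lanes shifted down after each partial store. A little-endian scalar sketch for one 8-byte row; the function name is illustrative.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Scalar model of the TBZ/STR/DUP store tail for one 8-byte result row
// (little-endian lane order, as on AArch64).
static void store_row_tail(int8_t* c, uint64_t row, size_t nc) {
  if (nc & 4) {
    memcpy(c, &row, 4);   // STR s, then DUP s, v.s[1] to expose the next lanes
    c += 4;
    row >>= 32;
  }
  if (nc & 2) {
    memcpy(c, &row, 2);   // STR h, then DUP h, v.h[1]
    c += 2;
    row >>= 16;
  }
  if (nc & 1) {
    *c = (int8_t) row;    // STR b
  }
}
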
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index e7b77ab..1da47f6 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -1006,6 +1006,9 @@
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64)
+
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
 
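For reference, DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION expands each of these names into an ordinary C prototype. A sketch of the shape it takes for the new 4x8 kernel (parameter names and comments here are illustrative; the header itself adds XNN_INTERNAL linkage and spells the params argument as a bounded array):

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the declared entry point: mr rows of A against an 8-wide
       column tile of packed weights, with rndnu requantization and clamping. */
    void xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64(
        size_t mr,                /* rows of A/C to process (1..4) */
        size_t nc,                /* remaining output columns */
        size_t kc,                /* reduction length, in bytes of A */
        const int8_t* a,          /* input activations */
        size_t a_stride,          /* bytes between rows of A */
        const void* w,            /* packed weights: per-channel bias, then B */
        int8_t* c,                /* output */
        size_t cm_stride,         /* bytes between rows of C */
        size_t cn_stride,         /* bytes between 8-column tiles of C */
        const union xnn_qs8_conv_minmax_params* params);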
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index c6067b4..b338f8c 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -829,6 +829,9 @@
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64)
+
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
 
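The IGEMM declarations follow the same pattern but take an indirection buffer instead of a flat A matrix, which is what the ks/a_offset/zero knobs in the tests below exercise. A sketch of the expanded prototype (again illustrative, not copied from the header):

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the indirect-GEMM entry point: `a` holds row pointers rather
       than a dense matrix, so convolution patches can alias or pad rows. */
    void xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64(
        size_t mr,                 /* rows of C to process (1..4) */
        size_t nc,                 /* remaining output columns */
        size_t kc,                 /* bytes of each input row to reduce over */
        size_t ks,                 /* size of the indirection window, in bytes */
        const int8_t** a,          /* indirection buffer of input-row pointers */
        const void* w,             /* packed weights: per-channel bias, then B */
        int8_t* c,                 /* output */
        size_t cm_stride,          /* bytes between rows of C */
        size_t cn_stride,          /* bytes between 8-column tiles of C */
        size_t a_offset,           /* byte offset applied to non-`zero` pointers */
        const int8_t* zero,        /* substitute row used for padding */
        const union xnn_qs8_conv_minmax_params* params);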
diff --git a/test/qs8-gemm-minmax-rndnu.cc b/test/qs8-gemm-minmax-rndnu.cc
index 06aebe6..906d414 100644
--- a/test/qs8-gemm-minmax-rndnu.cc
+++ b/test/qs8-gemm-minmax-rndnu.cc
@@ -22,6 +22,918 @@
 #include "gemm-microkernel-tester.h"
 
 
+#if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
diff --git a/test/qs8-gemm-minmax-rndnu.yaml b/test/qs8-gemm-minmax-rndnu.yaml
index a13d876..9d3ffb6 100644
--- a/test/qs8-gemm-minmax-rndnu.yaml
+++ b/test/qs8-gemm-minmax-rndnu.yaml
@@ -3,6 +3,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64
+  init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64
+  init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+  k-block: 8
 - name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
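The k-block: 8 entries mirror the kernels' 8-byte (ld64) main loop and drive the k boundaries probed by the generated tests above: k == 8, 1-7, 9-15, and multiples of 8 up to 80. As a plain-C illustration of that split, independent of the generator and the assembly:

    #include <stddef.h>

    /* Illustration only: how a reduction length kc decomposes for a k-block
       of 8. Full blocks go through the 8-byte main loop; a nonzero remainder
       takes the kernels' 1-7 byte tail path. */
    static void split_kc(size_t kc, size_t* full_blocks, size_t* remainder) {
      *full_blocks = kc / 8;
      *remainder = kc % 8;
    }

For example, k = 13 (from the k_gt_8 range) runs one full block and a 5-byte tail.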
diff --git a/test/qs8-igemm-minmax-rndnu.cc b/test/qs8-igemm-minmax-rndnu.cc
index a6fef58..fb77800 100644
--- a/test/qs8-igemm-minmax-rndnu.cc
+++ b/test/qs8-igemm-minmax-rndnu.cc
@@ -22,6 +22,942 @@
 #include "gemm-microkernel-tester.h"
 
 
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cn_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 8; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(8)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 8; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(11)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 9; n < 16; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 16; n <= 24; n += 8) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 8; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(8)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(11)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(8)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(8)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(8)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(8)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(8)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(8)
+      .k(8)
+      .cm_stride(11)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
diff --git a/test/qs8-igemm-minmax-rndnu.yaml b/test/qs8-igemm-minmax-rndnu.yaml
index 6bf87ad..c8a3b19 100644
--- a/test/qs8-igemm-minmax-rndnu.yaml
+++ b/test/qs8-igemm-minmax-rndnu.yaml
@@ -3,6 +3,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64
+  init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64
+  init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+  k-block: 8
 - name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8