4x16 lane AArch64 NEON GEMM/IGEMM ld64 microkernels
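
Adds ld64 variants of the 4x16 mlal-lane GEMM and IGEMM microkernels for
QS8 (RNDNU and FP32 requantization) and QC8 (per-channel FP32), each with
and without PRFM prefetch, generated from the new
src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in and
src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in templates.

As a rough illustration, each full 8-deep k-block of the 4x16 kernel
computes the following (a C sketch only -- mr/nc edge handling,
requantization, and the IGEMM indirection are omitted, and the function
and parameter names are invented for this sketch):

    #include <stddef.h>
    #include <stdint.h>

    // acc: v16-v31, 4 rows x 16 columns of int32 accumulators
    // a:   v0-v3, 8 bytes of A per row (one LD1 {v.8b} per row)
    // w:   x5, packed B with 16 int8 values per k step (one LDP d4, d5)
    static void gemm_4x16_kblock_ref(int32_t acc[4][16],
                                     const int8_t a[4][8],
                                     const int8_t* w) {
      for (size_t k = 0; k < 8; k++) {        // SMLAL/SMLAL2 on lane v.h[k]
        for (size_t m = 0; m < 4; m++) {
          for (size_t n = 0; n < 16; n++) {   // SXTL widens bytes first
            acc[m][n] += (int32_t) a[m][k] * (int32_t) w[k * 16 + n];
          }
        }
      }
    }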

PiperOrigin-RevId: 411642422
diff --git a/BUILD.bazel b/BUILD.bazel
index 9e3cfcc..e9cb45d 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -6199,6 +6199,18 @@
 ]
 
 AARCH64_ASM_MICROKERNEL_SRCS = [
+    "src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
+    "src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
+    "src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
+    "src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S",
+    "src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
+    "src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S",
+    "src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S",
+    "src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
     "src/f16-gemm/gen-inc/1x8inc-minmax-aarch64-neonfp16arith-ld64.S",
     "src/f16-gemm/gen-inc/1x16inc-minmax-aarch64-neonfp16arith-ld32.S",
     "src/f16-gemm/gen-inc/4x8inc-minmax-aarch64-neonfp16arith-ld64.S",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ce1cde9..6bdb274 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5186,6 +5186,18 @@
   src/f32-igemm/gen/4x8-minmax-aarch32-neon-pld-cortex-a75.S)
 
 SET(AARCH64_ASM_MICROKERNEL_SRCS
+  src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+  src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+  src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+  src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+  src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+  src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+  src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+  src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
+  src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+  src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+  src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+  src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
   src/f16-gemm/gen-inc/1x8inc-minmax-aarch64-neonfp16arith-ld64.S
   src/f16-gemm/gen-inc/1x16inc-minmax-aarch64-neonfp16arith-ld32.S
   src/f16-gemm/gen-inc/4x8inc-minmax-aarch64-neonfp16arith-ld64.S
diff --git a/bench/qs8-gemm-e2e.cc b/bench/qs8-gemm-e2e.cc
index a39dc8a..4af2454 100644
--- a/bench/qs8-gemm-e2e.cc
+++ b/bench/qs8-gemm-e2e.cc
@@ -136,6 +136,26 @@
       4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
       benchmark::utils::CheckNEON);
   }
+  static void qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
+      xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+    GEMMEnd2EndBenchmark(state, model,
+      xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
+      xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
+      xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
+      xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
+      xnn_init_qs8_conv_minmax_rndnu_neon_params,
+      4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+      benchmark::utils::CheckNEON);
+  }
   static void qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
     GEMMEnd2EndBenchmark(state, model,
       xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal,
@@ -183,6 +203,8 @@
   BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld128)
   BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
   BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
+  BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
+  BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53)
   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53)
   BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm)
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index 4630cc0..d98b51f 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -244,6 +244,14 @@
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, 4, 16, 1, 1,
       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
   }
+  static void qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, 4, 16, 1, 1,
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
+  static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
+    GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, 4, 16, 1, 1,
+      xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+  }
   static void qs8_gemm_1x8c8__aarch64_neon_mlal_prfm(benchmark::State& state, const char* net) {
     GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, 1, 8, 8, 1,
       xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
@@ -293,6 +301,8 @@
   BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_cortex_a55)
   BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
   BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
+  BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
+  BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
   BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_prfm)
   BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal)
   BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_prfm_cortex_a53)
diff --git a/scripts/generate-qs8-gemm.sh b/scripts/generate-qs8-gemm.sh
index 3be3587..973bdff 100755
--- a/scripts/generate-qs8-gemm.sh
+++ b/scripts/generate-qs8-gemm.sh
@@ -607,10 +607,7 @@
 tools/xngen src/qu8-gemm/c4-neondot.c.in -D MR=3  -D NR=32 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -o src/qu8-gemm/gen/3x32c4-minmax-rndnu-neondot.c &
 
 ############################### AArch64 assembly ##############################
-# Cortex A53 micro-kernel
-tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S &
-tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
-
+### Cortex-A53 lane micro-kernels
 tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S &
 tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
 
@@ -620,7 +617,19 @@
 tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S &
 tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
 
-# QU8 micro-kernels
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S &
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
+
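+### LD64 lane micro-kernels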
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=0 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=1 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S &
+
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=0 -D REQUANTIZATION=FP32     -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=1 -D REQUANTIZATION=FP32     -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S &
+
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=0 -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=1 -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S &
+
+### QU8 micro-kernels
 tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QU8 -o src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S &
 tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QU8 -o src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
 
diff --git a/scripts/generate-qs8-igemm.sh b/scripts/generate-qs8-igemm.sh
index d018881..6355a06 100755
--- a/scripts/generate-qs8-igemm.sh
+++ b/scripts/generate-qs8-igemm.sh
@@ -613,10 +613,7 @@
 tools/xngen src/qu8-igemm/c4-neondot.c.in -D MR=3  -D NR=32 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -o src/qu8-igemm/gen/3x32c4-minmax-rndnu-neondot.c &
 
 ############################### AArch64 assembly ##############################
-# Cortex A53 micro-kernel
-tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S &
-tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
-
+### Cortex-A53 lane micro-kernels
 tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S &
 tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
 
@@ -626,13 +623,26 @@
 tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S &
 tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
 
-# QU8 micro-kernels
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S &
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
+
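+### LD64 lane micro-kernels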
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=0 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=1 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S &
+
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=0 -D REQUANTIZATION=FP32     -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=1 -D REQUANTIZATION=FP32     -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S &
+
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=0 -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in   -D PREFETCH=1 -D REQUANTIZATION=FP32     -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S &
+
+### QU8 micro-kernels
 tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=0 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QU8 -o src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S &
 tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in   -D PREFETCH=1 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QU8 -o src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
 
 tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a75.S.in   -D PREFETCH=0 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QU8 -o src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a75.S &
 tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a75.S.in   -D PREFETCH=1 -D REQUANTIZATION=RNDNU    -D CHANNELWISE=0 -D DATATYPE=QU8 -o src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S &
 
+
 ### C4 micro-kernels
 tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S &
 tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in       -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S &
diff --git a/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..f9848c0
--- /dev/null
+++ b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,590 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3  x4 v3
+# B   x5 v4  v5  v6
+# C0  x6 v16 v20 v24 v28
+# C1  x8 v17 v21 v25 v29
+# C2  x9 v18 v22 v26 v30
+# C3  x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+# x10 x17 a53 temp registers (unused in this ld64 variant)
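+
+# Unlike the Cortex-A53 variant, this kernel loads B with plain LDP d4, d5
+# pairs (16 bytes per k step) and widens both operands with SXTL before the
+# SMLAL/SMLAL2 lane multiply-accumulates.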
+
+BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+        # Clamp A and C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x12, x11, [sp]          // Load cn_stride, params
+        ADD     x15, x3, x4             // a1 = a0 + a_stride
+        ADD     x8, x6, x7              // c1 = c0 + cm_stride
+        CSEL    x15, x3, x15, LO        //   a1 = a0
+        CSEL    x8, x6,  x8, LO         //   c1 = c0
+
+        ADD     x13, x15, x4            // a2 = a1 + a_stride
+        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
+                                        // if mr <= 2
+        CSEL    x13, x15, x13, LS       //   a2 = a1
+        CSEL    x9,  x8,  x9, LS        //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x4, x13, x4             // a3 = a2 + a_stride
+        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
+        CSEL    x4, x13, x4, LO         //   a3 = a2
+        CSEL    x7,  x9, x7, LO         //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        SUBS    x0, x2, 8               // k = kc - 8
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        # Are there at least 8 bytes for the main loop?
+        B.LO    3f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+1:
+        LD1     {v0.8b},  [x3], 8
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x15], 8
+        LD1     {v2.8b}, [x13], 8
+        LD1     {v3.8b},  [x4], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[7]
+        SMLAL2  v20.4s, v4.8h, v0.h[7]
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v17.4s, v4.4h, v1.h[7]
+        SMLAL2  v21.4s, v4.8h, v1.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v18.4s, v4.4h, v2.h[7]
+        SMLAL2  v22.4s, v4.8h, v2.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v19.4s, v4.4h, v3.h[7]
+        SMLAL2  v23.4s, v4.8h, v3.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    1b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? (1 to 7 bytes of A)
+        CBNZ    x0, 3f
+
+2:
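+        # FP32 requantization: convert the accumulators to float (SCVTF),
+        # scale by the per-channel values stored after the packed weights
+        # (FMUL), round to nearest (FCVTNS), then narrow with saturation
+        # (SQXTN), add the output zero point (SQADD) and clamp (SMAX/SMIN).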
+        SCVTF   v16.4s, v16.4s
+        SCVTF   v17.4s, v17.4s
+        # Load per channel scale values from weights
+        LDR     q4, [x5], 16
+        SCVTF   v18.4s, v18.4s
+        SCVTF   v19.4s, v19.4s
+        LDR     q5, [x5], 16
+        SCVTF   v20.4s, v20.4s
+        SCVTF   v21.4s, v21.4s
+        SCVTF   v22.4s, v22.4s
+        SCVTF   v23.4s, v23.4s
+        SCVTF   v24.4s, v24.4s
+        SCVTF   v25.4s, v25.4s
+        SCVTF   v26.4s, v26.4s
+        SCVTF   v27.4s, v27.4s
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        LDR     q6, [x5], 16
+        FMUL    v16.4s, v16.4s, v4.4s
+        FMUL    v17.4s, v17.4s, v4.4s
+        FMUL    v18.4s, v18.4s, v4.4s
+        FMUL    v19.4s, v19.4s, v4.4s
+        FMUL    v20.4s, v20.4s, v5.4s
+        LDR     q4, [x5], 16
+        FMUL    v21.4s, v21.4s, v5.4s
+        FMUL    v22.4s, v22.4s, v5.4s
+        FMUL    v23.4s, v23.4s, v5.4s
+        FMUL    v24.4s, v24.4s, v6.4s
+        FMUL    v25.4s, v25.4s, v6.4s
+        FMUL    v26.4s, v26.4s, v6.4s
+        FMUL    v27.4s, v27.4s, v6.4s
+        FMUL    v28.4s, v28.4s, v4.4s
+        FMUL    v29.4s, v29.4s, v4.4s
+        FMUL    v30.4s, v30.4s, v4.4s
+        FMUL    v31.4s, v31.4s, v4.4s
+
+        FCVTNS  v16.4s, v16.4s
+        FCVTNS  v17.4s, v17.4s
+        FCVTNS  v18.4s, v18.4s
+        FCVTNS  v19.4s, v19.4s
+        FCVTNS  v20.4s, v20.4s
+        FCVTNS  v21.4s, v21.4s
+        FCVTNS  v22.4s, v22.4s
+        FCVTNS  v23.4s, v23.4s
+        FCVTNS  v24.4s, v24.4s
+        FCVTNS  v25.4s, v25.4s
+        FCVTNS  v26.4s, v26.4s
+        FCVTNS  v27.4s, v27.4s
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2       // add bias
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v6.8h
+        SQADD   v17.8h, v17.8h, v6.8h
+        SQADD   v18.8h, v18.8h, v6.8h
+        SQADD   v19.8h, v19.8h, v6.8h
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v4.16b}, [x11], 1      // clamp min value
+
+        SQXTN   v0.8b, v16.8h
+        SQXTN   v1.8b, v17.8h
+        SQXTN   v2.8b, v18.8h
+        SQXTN   v3.8b, v19.8h
+        LD1R    {v5.16b}, [x11]         // clamp max value
+        SQXTN2  v0.16b, v24.8h
+        SQXTN2  v1.16b, v25.8h
+        SQXTN2  v2.16b, v26.8h
+        SQXTN2  v3.16b, v27.8h
+        SUB     x11, x11, 3             // rewind params pointer
+
+        SMAX    v0.16b, v0.16b, v4.16b
+        SMAX    v1.16b, v1.16b, v4.16b
+        SMAX    v2.16b, v2.16b, v4.16b
+        SMAX    v3.16b, v3.16b, v4.16b
+        SUBS    x1, x1, 16
+        SMIN    v0.16b, v0.16b, v5.16b
+        SMIN    v1.16b, v1.16b, v5.16b
+        SMIN    v2.16b, v2.16b, v5.16b
+        SMIN    v3.16b, v3.16b, v5.16b
+        B.LO    4f
+
+        # Store full 4 x 16
+        ST1     {v0.16b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        ST1     {v1.16b}, [x8], x12
+        SUB     x15, x15, x2            // a1 -= kc
+        ST1     {v2.16b}, [x9], x12
+        SUB     x13, x13, x2            // a2 -= kc
+        ST1     {v3.16b}, [x7], x12
+        SUB     x4,  x4, x2             // a3 -= kc
+        B.NE    0b
+        RET
+
+        # Remainder: 1 to 7 bytes of A
+        .p2align 3
+3:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b},  [x3], x0
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x15], x0
+        LD1     {v2.8b}, [x13], x0
+        LD1     {v3.8b},  [x4], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       2b
+
+        # Store odd width
+        .p2align 3
+4:
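+        # nc remainder: test bits 3..0 of x1 and store 8, 4, 2 and/or 1
+        # bytes per row, using DUP to shift the remaining lanes down.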
+        TBZ     x1, 3, 5f
+        STR     d0, [x6], 8
+        STR     d1, [x8], 8
+        DUP     d0, v0.d[1]
+        DUP     d1, v1.d[1]
+        STR     d2, [x9], 8
+        STR     d3, [x7], 8
+        DUP     d2, v2.d[1]
+        DUP     d3, v3.d[1]
+5:
+        TBZ     x1, 2, 6f
+        STR     s0, [x6], 4
+        STR     s1, [x8], 4
+        DUP     s0, v0.s[1]
+        DUP     s1, v1.s[1]
+        STR     s2, [x9], 4
+        STR     s3, [x7], 4
+        DUP     s2, v2.s[1]
+        DUP     s3, v3.s[1]
+6:
+        TBZ     x1, 1, 7f
+        STR     h0, [x6], 2
+        STR     h1, [x8], 2
+        DUP     h0, v0.h[1]
+        DUP     h1, v1.h[1]
+        STR     h2, [x9], 2
+        STR     h3, [x7], 2
+        DUP     h2, v2.h[1]
+        DUP     h3, v3.h[1]
+7:
+        TBZ     x1, 0, 8f
+        STR     b0, [x6]
+        STR     b1, [x8]
+        STR     b2, [x9]
+        STR     b3, [x7]
+8:
+        RET
+
+END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..1c45866
--- /dev/null
+++ b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,596 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3  x4 v3
+# B   x5 v4  v5  v6
+# C0  x6 v16 v20 v24 v28
+# C1  x8 v17 v21 v25 v29
+# C2  x9 v18 v22 v26 v30
+# C3  x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+# x10 x17 a53 temp registers (unused in this ld64 variant)
+
+BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+        # Clamp A and C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x12, x11, [sp]          // Load cn_stride, params
+        ADD     x15, x3, x4             // a1 = a0 + a_stride
+        ADD     x8, x6, x7              // c1 = c0 + cm_stride
+        CSEL    x15, x3, x15, LO        //   a1 = a0
+        CSEL    x8, x6,  x8, LO         //   c1 = c0
+
+        ADD     x13, x15, x4            // a2 = a1 + a_stride
+        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
+                                        // if mr <= 2
+        CSEL    x13, x15, x13, LS       //   a2 = a1
+        CSEL    x9,  x8,  x9, LS        //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x4, x13, x4             // a3 = a2 + a_stride
+        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
+        CSEL    x4, x13, x4, LO         //   a3 = a2
+        CSEL    x7,  x9, x7, LO         //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        SUBS    x0, x2, 8               // k = kc - 8
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        # Are there at least 8 bytes for the main loop?
+        B.LO    3f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+1:
+        LD1     {v0.8b},  [x3], 8
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x15], 8
+        LD1     {v2.8b}, [x13], 8
+        LD1     {v3.8b},  [x4], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
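+        # Prefetch A rows 128 bytes ahead and B 448-512 bytes ahead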
+        PRFM    PLDL1KEEP, [x13, 128]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        PRFM    PLDL1KEEP, [x15, 128]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        PRFM    PLDL1KEEP, [x3, 128]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        PRFM    PLDL1KEEP, [x4, 128]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        PRFM    PLDL1KEEP, [x5, 448]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        PRFM    PLDL1KEEP, [x5, 512]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[7]
+        SMLAL2  v20.4s, v4.8h, v0.h[7]
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v17.4s, v4.4h, v1.h[7]
+        SMLAL2  v21.4s, v4.8h, v1.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v18.4s, v4.4h, v2.h[7]
+        SMLAL2  v22.4s, v4.8h, v2.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v19.4s, v4.4h, v3.h[7]
+        SMLAL2  v23.4s, v4.8h, v3.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    1b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? (1 to 7 bytes of A)
+        CBNZ    x0, 3f
+
+2:
+        SCVTF   v16.4s, v16.4s
+        SCVTF   v17.4s, v17.4s
+        # Load per channel scale values from weights
+        LDR     q4, [x5], 16
+        SCVTF   v18.4s, v18.4s
+        SCVTF   v19.4s, v19.4s
+        LDR     q5, [x5], 16
+        SCVTF   v20.4s, v20.4s
+        SCVTF   v21.4s, v21.4s
+        SCVTF   v22.4s, v22.4s
+        SCVTF   v23.4s, v23.4s
+        SCVTF   v24.4s, v24.4s
+        SCVTF   v25.4s, v25.4s
+        SCVTF   v26.4s, v26.4s
+        SCVTF   v27.4s, v27.4s
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        LDR     q6, [x5], 16
+        FMUL    v16.4s, v16.4s, v4.4s
+        FMUL    v17.4s, v17.4s, v4.4s
+        FMUL    v18.4s, v18.4s, v4.4s
+        FMUL    v19.4s, v19.4s, v4.4s
+        FMUL    v20.4s, v20.4s, v5.4s
+        LDR     q4, [x5], 16
+        FMUL    v21.4s, v21.4s, v5.4s
+        FMUL    v22.4s, v22.4s, v5.4s
+        FMUL    v23.4s, v23.4s, v5.4s
+        FMUL    v24.4s, v24.4s, v6.4s
+        FMUL    v25.4s, v25.4s, v6.4s
+        FMUL    v26.4s, v26.4s, v6.4s
+        FMUL    v27.4s, v27.4s, v6.4s
+        FMUL    v28.4s, v28.4s, v4.4s
+        FMUL    v29.4s, v29.4s, v4.4s
+        FMUL    v30.4s, v30.4s, v4.4s
+        FMUL    v31.4s, v31.4s, v4.4s
+
+        FCVTNS  v16.4s, v16.4s
+        FCVTNS  v17.4s, v17.4s
+        FCVTNS  v18.4s, v18.4s
+        FCVTNS  v19.4s, v19.4s
+        FCVTNS  v20.4s, v20.4s
+        FCVTNS  v21.4s, v21.4s
+        FCVTNS  v22.4s, v22.4s
+        FCVTNS  v23.4s, v23.4s
+        FCVTNS  v24.4s, v24.4s
+        FCVTNS  v25.4s, v25.4s
+        FCVTNS  v26.4s, v26.4s
+        FCVTNS  v27.4s, v27.4s
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2       // add bias
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v6.8h
+        SQADD   v17.8h, v17.8h, v6.8h
+        SQADD   v18.8h, v18.8h, v6.8h
+        SQADD   v19.8h, v19.8h, v6.8h
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v4.16b}, [x11], 1      // clamp min value
+
+        SQXTN   v0.8b, v16.8h
+        SQXTN   v1.8b, v17.8h
+        SQXTN   v2.8b, v18.8h
+        SQXTN   v3.8b, v19.8h
+        LD1R    {v5.16b}, [x11]         // clamp max value
+        SQXTN2  v0.16b, v24.8h
+        SQXTN2  v1.16b, v25.8h
+        SQXTN2  v2.16b, v26.8h
+        SQXTN2  v3.16b, v27.8h
+        SUB     x11, x11, 3             // rewind params pointer
+
+        SMAX    v0.16b, v0.16b, v4.16b
+        SMAX    v1.16b, v1.16b, v4.16b
+        SMAX    v2.16b, v2.16b, v4.16b
+        SMAX    v3.16b, v3.16b, v4.16b
+        SUBS    x1, x1, 16
+        SMIN    v0.16b, v0.16b, v5.16b
+        SMIN    v1.16b, v1.16b, v5.16b
+        SMIN    v2.16b, v2.16b, v5.16b
+        SMIN    v3.16b, v3.16b, v5.16b
+        B.LO    4f
+
+        # Store full 4 x 16
+        ST1     {v0.16b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        ST1     {v1.16b}, [x8], x12
+        SUB     x15, x15, x2            // a1 -= kc
+        ST1     {v2.16b}, [x9], x12
+        SUB     x13, x13, x2            // a2 -= kc
+        ST1     {v3.16b}, [x7], x12
+        SUB     x4,  x4, x2             // a3 -= kc
+        B.NE    0b
+        RET
+
+        # Remainder: 1 to 7 bytes of A
+        .p2align 3
+3:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b},  [x3], x0
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x15], x0
+        LD1     {v2.8b}, [x13], x0
+        LD1     {v3.8b},  [x4], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       2b
+
+        # Store odd width
+        .p2align 3
+4:
+        TBZ     x1, 3, 5f
+        STR     d0, [x6], 8
+        STR     d1, [x8], 8
+        DUP     d0, v0.d[1]
+        DUP     d1, v1.d[1]
+        STR     d2, [x9], 8
+        STR     d3, [x7], 8
+        DUP     d2, v2.d[1]
+        DUP     d3, v3.d[1]
+5:
+        TBZ     x1, 2, 6f
+        STR     s0, [x6], 4
+        STR     s1, [x8], 4
+        DUP     s0, v0.s[1]
+        DUP     s1, v1.s[1]
+        STR     s2, [x9], 4
+        STR     s3, [x7], 4
+        DUP     s2, v2.s[1]
+        DUP     s3, v3.s[1]
+6:
+        TBZ     x1, 1, 7f
+        STR     h0, [x6], 2
+        STR     h1, [x8], 2
+        DUP     h0, v0.h[1]
+        DUP     h1, v1.h[1]
+        STR     h2, [x9], 2
+        STR     h3, [x7], 2
+        DUP     h2, v2.h[1]
+        DUP     h3, v3.h[1]
+7:
+        TBZ     x1, 0, 8f
+        STR     b0, [x6]
+        STR     b1, [x8]
+        STR     b2, [x9]
+        STR     b3, [x7]
+8:
+        RET
+
+END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..ea14f76
--- /dev/null
+++ b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,618 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t** restrict a, x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x8
+#     const int8_t* zero,                [sp + 16] -> x12
+#     const xnn_qs8_conv_minmax_params params)  [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x13  v0
+# A1  x14  v1
+# A2  x15  v2
+# A3  x20  v3
+# B    x5  v4  v5  v6
+# C0   x6 v16 v20 v24 v28
+# C1  x16 v17 v21 v25 v29
+# C2  x17 v18 v22 v26 v30
+# C3   x7 v19 v23 v27 v31
+# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
+# x11, x21 temp for Cortex-A53 loads
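+#
+# Accumulator layout (from the table above): each row Cn spans four int32x4
+# registers covering the 16 output channels, e.g. row 0 accumulates into
+# v16 (channels 0-3), v20 (4-7), v24 (8-11) and v28 (12-15); rows 1-3 use
+# the next register of each group.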
+
+BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+        # Clamp C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
+        ADD     x16, x6, x7             // c1 = c0 + cm_stride
+        CSEL    x16, x6,  x16, LO       //   c1 = c0
+
+        ADD     x17, x16, x7            // c2 = c1 + cm_stride
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
+                                        // if mr <= 2
+        CSEL    x17, x16, x17, LS       //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        CSEL    x7,  x17, x7, LO        //   c3 = c2
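+
+        # A C sketch of the clamping above:
+        #   c1 = mr < 2 ? c0 : c0 + cm_stride;
+        #   c2 = mr <= 2 ? c1 : c1 + cm_stride;
+        #   c3 = mr < 4 ? c2 : c2 + cm_stride;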
+
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP     x13, x14, [x4], 16
+        LDP     x15, x20, [x4], 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x8            // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
+        CMP     x14, x12                // if a1 == zero
+        ADD     x14, x14, x8            // a1 += a_offset
+        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
+        CMP     x15, x12                // if a2 == zero
+        ADD     x15, x15, x8            // a2 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
+        CMP     x20, x12                // if a3 == zero
+        ADD     x20, x20, x8            // a3 += a_offset
+        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset
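+
+        # The zero pointer stands in for padded input rows: when an A pointer
+        # equals zero it is used unmodified, so a_offset is never applied to
+        # it.  A C sketch of the CMP/ADD/CSEL triples above:
+        #   aN = (aN == zero) ? zero : aN + a_offset;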
+
+        # Are there at least 8 bytes for the main loop?
+        SUBS    x0, x2, 8               // k = kc - 8
+        B.LO    4f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+2:
+        LD1     {v0.8b}, [x13], 8
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x14], 8
+        LD1     {v2.8b}, [x15], 8
+        LD1     {v3.8b}, [x20], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[7]
+        SMLAL2  v20.4s, v4.8h, v0.h[7]
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v17.4s, v4.4h, v1.h[7]
+        SMLAL2  v21.4s, v4.8h, v1.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v18.4s, v4.4h, v2.h[7]
+        SMLAL2  v22.4s, v4.8h, v2.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v19.4s, v4.4h, v3.h[7]
+        SMLAL2  v23.4s, v4.8h, v3.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    2b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? 1 to 7 bytes of A
+        CBNZ    x0, 4f
+
+3:
+        # ks loop
+        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+        SCVTF   v16.4s, v16.4s
+        SCVTF   v17.4s, v17.4s
+        # Load per-channel scale values from weights
+        LDR     q4, [x5], 16
+        SCVTF   v18.4s, v18.4s
+        SCVTF   v19.4s, v19.4s
+        LDR     q5, [x5], 16
+        SCVTF   v20.4s, v20.4s
+        SCVTF   v21.4s, v21.4s
+        SCVTF   v22.4s, v22.4s
+        SCVTF   v23.4s, v23.4s
+        SCVTF   v24.4s, v24.4s
+        SCVTF   v25.4s, v25.4s
+        SCVTF   v26.4s, v26.4s
+        SCVTF   v27.4s, v27.4s
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        LDR     q6, [x5], 16
+        FMUL    v16.4s, v16.4s, v4.4s
+        FMUL    v17.4s, v17.4s, v4.4s
+        FMUL    v18.4s, v18.4s, v4.4s
+        FMUL    v19.4s, v19.4s, v4.4s
+        FMUL    v20.4s, v20.4s, v5.4s
+        LDR     q4, [x5], 16
+        FMUL    v21.4s, v21.4s, v5.4s
+        FMUL    v22.4s, v22.4s, v5.4s
+        FMUL    v23.4s, v23.4s, v5.4s
+        FMUL    v24.4s, v24.4s, v6.4s
+        FMUL    v25.4s, v25.4s, v6.4s
+        FMUL    v26.4s, v26.4s, v6.4s
+        FMUL    v27.4s, v27.4s, v6.4s
+        FMUL    v28.4s, v28.4s, v4.4s
+        FMUL    v29.4s, v29.4s, v4.4s
+        FMUL    v30.4s, v30.4s, v4.4s
+        FMUL    v31.4s, v31.4s, v4.4s
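+
+        # The LDR q4/q5/q6/q4 loads above pick up the 16 per-channel scales
+        # that trail the packed weights; v4, v5, v6 and the reloaded v4
+        # multiply channel groups 0-3, 4-7, 8-11 and 12-15 respectively.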
+
+        FCVTNS  v16.4s, v16.4s
+        FCVTNS  v17.4s, v17.4s
+        FCVTNS  v18.4s, v18.4s
+        FCVTNS  v19.4s, v19.4s
+        FCVTNS  v20.4s, v20.4s
+        FCVTNS  v21.4s, v21.4s
+        FCVTNS  v22.4s, v22.4s
+        FCVTNS  v23.4s, v23.4s
+        FCVTNS  v24.4s, v24.4s
+        FCVTNS  v25.4s, v25.4s
+        FCVTNS  v26.4s, v26.4s
+        FCVTNS  v27.4s, v27.4s
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
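+
+        # FCVTNS rounds to nearest with ties to even, so per channel c this
+        # epilogue computes, roughly (a C sketch, not the generated code):
+        #   out[c] = clamp(output_zero_point
+        #                  + (int32_t) lrintf((float) acc[c] * scale[c]), min, max)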
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2        // load output zero_point
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v6.8h
+        SQADD   v17.8h, v17.8h, v6.8h
+        SQADD   v18.8h, v18.8h, v6.8h
+        SQADD   v19.8h, v19.8h, v6.8h
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v4.16b}, [x11], 1       // clamp min value
+
+        SQXTN   v0.8b, v16.8h
+        SQXTN   v1.8b, v17.8h
+        SQXTN   v2.8b, v18.8h
+        SQXTN   v3.8b, v19.8h
+        LD1R    {v5.16b}, [x11]          // clamp max value
+        SQXTN2  v0.16b, v24.8h
+        SQXTN2  v1.16b, v25.8h
+        SQXTN2  v2.16b, v26.8h
+        SQXTN2  v3.16b, v27.8h
+        SUB     x11, x11, 3             // rewind params pointer
+
+        SMAX    v0.16b, v0.16b, v4.16b
+        SMAX    v1.16b, v1.16b, v4.16b
+        SMAX    v2.16b, v2.16b, v4.16b
+        SMAX    v3.16b, v3.16b, v4.16b
+        SUBS    x1, x1, 16
+        SMIN    v0.16b, v0.16b, v5.16b
+        SMIN    v1.16b, v1.16b, v5.16b
+        SMIN    v2.16b, v2.16b, v5.16b
+        SMIN    v3.16b, v3.16b, v5.16b
+        B.LO    5f
+
+        # Store full 4 x 16
+        ST1     {v3.16b},  [x7], x10
+        ST1     {v2.16b}, [x17], x10
+        ST1     {v1.16b}, [x16], x10
+        ST1     {v0.16b},  [x6], x10
+
+        SUB     x4, x4, x3              // a -= ks
+
+        # nc loop
+        B.HI    0b
+
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+        # Remainder: 1 to 7 bytes of A
+        .p2align 3
+4:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b}, [x13], x0
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x14], x0
+        LD1     {v2.8b}, [x15], x0
+        LD1     {v3.8b}, [x20], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       3b
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ     x1, 3, 6f
+        STR     d3, [x7], 8
+        STR     d2, [x17], 8
+        DUP     d3, v3.d[1]
+        DUP     d2, v2.d[1]
+        STR     d1, [x16], 8
+        STR     d0, [x6], 8
+        DUP     d1, v1.d[1]
+        DUP     d0, v0.d[1]
+6:
+        TBZ     x1, 2, 7f
+        STR     s3, [x7], 4
+        STR     s2, [x17], 4
+        DUP     s3, v3.s[1]
+        DUP     s2, v2.s[1]
+        STR     s1, [x16], 4
+        STR     s0, [x6], 4
+        DUP     s1, v1.s[1]
+        DUP     s0, v0.s[1]
+7:
+        TBZ     x1, 1, 8f
+        STR     h3, [x7], 2
+        STR     h2, [x17], 2
+        DUP     h3, v3.h[1]
+        DUP     h2, v2.h[1]
+        STR     h1, [x16], 2
+        STR     h0, [x6], 2
+        DUP     h1, v1.h[1]
+        DUP     h0, v0.h[1]
+8:
+        TBZ     x1, 0, 9f
+        STR     b3, [x7]
+        STR     b2, [x17]
+        STR     b1, [x16]
+        STR     b0, [x6]
+9:
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..908e363
--- /dev/null
+++ b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,624 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t** restrict a, x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x8
+#     const int8_t* zero,                [sp + 16] -> x12
+#     const xnn_qs8_conv_minmax_params params)  [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x13  v0
+# A1  x14  v1
+# A2  x15  v2
+# A3  x20  v3
+# B    x5  v4  v5  v6
+# C0   x6 v16 v20 v24 v28
+# C1  x16 v17 v21 v25 v29
+# C2  x17 v18 v22 v26 v30
+# C3   x7 v19 v23 v27 v31
+# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
+# x11, x21 temp for Cortex-A53 loads
+
+BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+        # Clamp C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
+        ADD     x16, x6, x7             // c1 = c0 + cm_stride
+        CSEL    x16, x6,  x16, LO       //   c1 = c0
+
+        ADD     x17, x16, x7            // c2 = c1 + cm_stride
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
+                                        // if mr <= 2
+        CSEL    x17, x16, x17, LS       //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        CSEL    x7,  x17, x7, LO        //   c3 = c2
+
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP     x13, x14, [x4], 16
+        LDP     x15, x20, [x4], 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x8            // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
+        CMP     x14, x12                // if a1 == zero
+        ADD     x14, x14, x8            // a1 += a_offset
+        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
+        CMP     x15, x12                // if a2 == zero
+        ADD     x15, x15, x8            // a2 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
+        CMP     x20, x12                // if a3 == zero
+        ADD     x20, x20, x8            // a3 += a_offset
+        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset
+
+        # Are there at least 8 bytes for the main loop?
+        SUBS    x0, x2, 8               // k = kc - 8
+        B.LO    4f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+2:
+        LD1     {v0.8b}, [x13], 8
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x14], 8
+        LD1     {v2.8b}, [x15], 8
+        LD1     {v3.8b}, [x20], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        PRFM    PLDL1KEEP, [x13, 128]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        PRFM    PLDL1KEEP, [x14, 128]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        PRFM    PLDL1KEEP, [x15, 128]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        PRFM    PLDL1KEEP, [x20, 128]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        PRFM    PLDL1KEEP, [x5, 448]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        PRFM    PLDL1KEEP, [x5, 512]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
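+
+        # Prefetch distances (the prfm variant's only difference): A rows are
+        # pulled 128 bytes ahead and packed weights 448 and 512 bytes ahead,
+        # all with PLDL1KEEP.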
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[7]
+        SMLAL2  v20.4s, v4.8h, v0.h[7]
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v17.4s, v4.4h, v1.h[7]
+        SMLAL2  v21.4s, v4.8h, v1.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v18.4s, v4.4h, v2.h[7]
+        SMLAL2  v22.4s, v4.8h, v2.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v19.4s, v4.4h, v3.h[7]
+        SMLAL2  v23.4s, v4.8h, v3.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    2b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? 1 to 7 bytes of A
+        CBNZ    x0, 4f
+
+3:
+        # ks loop
+        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+        SCVTF   v16.4s, v16.4s
+        SCVTF   v17.4s, v17.4s
+        # Load per-channel scale values from weights
+        LDR     q4, [x5], 16
+        SCVTF   v18.4s, v18.4s
+        SCVTF   v19.4s, v19.4s
+        LDR     q5, [x5], 16
+        SCVTF   v20.4s, v20.4s
+        SCVTF   v21.4s, v21.4s
+        SCVTF   v22.4s, v22.4s
+        SCVTF   v23.4s, v23.4s
+        SCVTF   v24.4s, v24.4s
+        SCVTF   v25.4s, v25.4s
+        SCVTF   v26.4s, v26.4s
+        SCVTF   v27.4s, v27.4s
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        LDR     q6, [x5], 16
+        FMUL    v16.4s, v16.4s, v4.4s
+        FMUL    v17.4s, v17.4s, v4.4s
+        FMUL    v18.4s, v18.4s, v4.4s
+        FMUL    v19.4s, v19.4s, v4.4s
+        FMUL    v20.4s, v20.4s, v5.4s
+        LDR     q4, [x5], 16
+        FMUL    v21.4s, v21.4s, v5.4s
+        FMUL    v22.4s, v22.4s, v5.4s
+        FMUL    v23.4s, v23.4s, v5.4s
+        FMUL    v24.4s, v24.4s, v6.4s
+        FMUL    v25.4s, v25.4s, v6.4s
+        FMUL    v26.4s, v26.4s, v6.4s
+        FMUL    v27.4s, v27.4s, v6.4s
+        FMUL    v28.4s, v28.4s, v4.4s
+        FMUL    v29.4s, v29.4s, v4.4s
+        FMUL    v30.4s, v30.4s, v4.4s
+        FMUL    v31.4s, v31.4s, v4.4s
+
+        FCVTNS  v16.4s, v16.4s
+        FCVTNS  v17.4s, v17.4s
+        FCVTNS  v18.4s, v18.4s
+        FCVTNS  v19.4s, v19.4s
+        FCVTNS  v20.4s, v20.4s
+        FCVTNS  v21.4s, v21.4s
+        FCVTNS  v22.4s, v22.4s
+        FCVTNS  v23.4s, v23.4s
+        FCVTNS  v24.4s, v24.4s
+        FCVTNS  v25.4s, v25.4s
+        FCVTNS  v26.4s, v26.4s
+        FCVTNS  v27.4s, v27.4s
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2        // load output zero_point
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v6.8h
+        SQADD   v17.8h, v17.8h, v6.8h
+        SQADD   v18.8h, v18.8h, v6.8h
+        SQADD   v19.8h, v19.8h, v6.8h
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v4.16b}, [x11], 1       // clamp min value
+
+        SQXTN   v0.8b, v16.8h
+        SQXTN   v1.8b, v17.8h
+        SQXTN   v2.8b, v18.8h
+        SQXTN   v3.8b, v19.8h
+        LD1R    {v5.16b}, [x11]          // clamp max value
+        SQXTN2  v0.16b, v24.8h
+        SQXTN2  v1.16b, v25.8h
+        SQXTN2  v2.16b, v26.8h
+        SQXTN2  v3.16b, v27.8h
+        SUB     x11, x11, 3             // rewind params pointer
+
+        SMAX    v0.16b, v0.16b, v4.16b
+        SMAX    v1.16b, v1.16b, v4.16b
+        SMAX    v2.16b, v2.16b, v4.16b
+        SMAX    v3.16b, v3.16b, v4.16b
+        SUBS    x1, x1, 16
+        SMIN    v0.16b, v0.16b, v5.16b
+        SMIN    v1.16b, v1.16b, v5.16b
+        SMIN    v2.16b, v2.16b, v5.16b
+        SMIN    v3.16b, v3.16b, v5.16b
+        B.LO    5f
+
+        # Store full 4 x 16
+        ST1     {v3.16b},  [x7], x10
+        ST1     {v2.16b}, [x17], x10
+        ST1     {v1.16b}, [x16], x10
+        ST1     {v0.16b},  [x6], x10
+
+        SUB     x4, x4, x3              // a -= ks
+
+        # nc loop
+        B.HI    0b
+
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+        # Remainder: 1 to 7 bytes of A
+        .p2align 3
+4:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b}, [x13], x0
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x14], x0
+        LD1     {v2.8b}, [x15], x0
+        LD1     {v3.8b}, [x20], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       3b
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ     x1, 3, 6f
+        STR     d3, [x7], 8
+        STR     d2, [x17], 8
+        DUP     d3, v3.d[1]
+        DUP     d2, v2.d[1]
+        STR     d1, [x16], 8
+        STR     d0, [x6], 8
+        DUP     d1, v1.d[1]
+        DUP     d0, v0.d[1]
+6:
+        TBZ     x1, 2, 7f
+        STR     s3, [x7], 4
+        STR     s2, [x17], 4
+        DUP     s3, v3.s[1]
+        DUP     s2, v2.s[1]
+        STR     s1, [x16], 4
+        STR     s0, [x6], 4
+        DUP     s1, v1.s[1]
+        DUP     s0, v0.s[1]
+7:
+        TBZ     x1, 1, 8f
+        STR     h3, [x7], 2
+        STR     h2, [x17], 2
+        DUP     h3, v3.h[1]
+        DUP     h2, v2.h[1]
+        STR     h1, [x16], 2
+        STR     h0, [x6], 2
+        DUP     h1, v1.h[1]
+        DUP     h0, v0.h[1]
+8:
+        TBZ     x1, 0, 9f
+        STR     b3, [x7]
+        STR     b2, [x17]
+        STR     b1, [x16]
+        STR     b0, [x6]
+9:
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in b/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
new file mode 100644
index 0000000..6d31b42
--- /dev/null
+++ b/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
@@ -0,0 +1,881 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert REQUANTIZATION in ["FP32", "GEMMLOWP", "RNDNU"]
+$assert not CHANNELWISE or REQUANTIZATION == "FP32"
+$assert DATATYPE in ["QC8", "QS8", "QU8"]
+$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
+$assert DATATYPE != "QU8" or REQUANTIZATION == "RNDNU"
+
+#include <xnnpack/assembly.h>
+
+$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
+$if DATATYPE == "QU8":
+  $REWIND_DECREMENT = 15
+$else:
+  $REWIND_DECREMENT = 3 if CHANNELWISE else {"GEMMLOWP": 11, "RNDNU": 15, "FP32": 7}[REQUANTIZATION]
+$XMIN = "UMIN" if DATATYPE == "QU8" else "SMIN"
+$XMAX = "UMAX" if DATATYPE == "QU8" else "SMAX"
+$XXTL = "UXTL" if DATATYPE == "QU8" else "SXTL"
+$SQXTXN = "SQXTUN" if DATATYPE == "QU8" else "SQXTN"
+$SQXTXN2 = "SQXTUN2" if DATATYPE == "QU8" else "SQXTN2"
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
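+
+# Template knobs, as read from the definitions above: REWIND_DECREMENT is the
+# number of bytes by which the epilogue advances the params pointer and must
+# rewind per nc iteration (e.g. 3 on the channelwise FP32 path, matching its
+# "SUB x11, x11, 3"); XMIN/XMAX/XXTL/SQXTXN swap in the unsigned clamp,
+# extend and narrow instructions for QU8.
+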
+# void xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const ${XINT8_T}* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     ${XINT8_T}* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+
+$if REQUANTIZATION == "RNDNU" and DATATYPE == "QU8":
+  # params structure is 20 bytes
+  #  struct {
+  #    ${XINT8_T} kernel_zero_point[4];
+  #    int32_t right_pre_shift;
+  #    int32_t multiplier;
+  #    int32_t right_post_shift;
+  #    int16_t output_zero_point;
+  #    ${XINT8_T} output_min;
+  #    ${XINT8_T} output_max;
+  #  } rndnu_neon;
+  #
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3  x4 v3
+# B   x5 v4  v5  v6
+# C0  x6 v16 v20 v24 v28
+# C1  x8 v17 v21 v25 v29
+# C2  x9 v18 v22 v26 v30
+# C3  x7 v19 v23 v27 v31
+$if DATATYPE == "QU8":
+  # zero_point  v7
+  # unused v8 v9 v10 v11 v12 v13 v14 v15
+$else:
+  # unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+# x10, x17 temp registers for Cortex-A53 loads
+
+BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+        # Clamp A and C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x12, x11, [sp]          // Load cn_stride, params
+        ADD     x15, x3, x4             // a1 = a0 + a_stride
+        ADD     x8, x6, x7              // c1 = c0 + cm_stride
+        CSEL    x15, x3, x15, LO        //   a1 = a0
+        CSEL    x8, x6,  x8, LO         //   c1 = c0
+
+        ADD     x13, x15, x4            // a2 = a1 + a_stride
+        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
+                                        // if mr <= 2
+        CSEL    x13, x15, x13, LS       //   a2 = a1
+        CSEL    x9,  x8,  x9, LS        //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x4, x13, x4             // a3 = a2 + a_stride
+        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
+        CSEL    x4, x13, x4, LO         //   a3 = a2
+        CSEL    x7,  x9, x7, LO         //   c3 = c2
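+
+        # Clamping makes rows beyond mr alias the previous valid row, so the
+        # kernel can always process a full 4-row tile without reading or
+        # writing outside the caller's buffers.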
+        $if DATATYPE == "QU8":
+          LD1R    {v7.4s}, [x11], 4        // kernel_zero_point
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        SUBS    x0, x2, 8               // k = kc - 8
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        # Are there at least 8 bytes for the main loop?
+        B.LO    3f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+1:
+        LD1     {v0.8b},  [x3], 8
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x15], 8
+        LD1     {v2.8b}, [x13], 8
+        LD1     {v3.8b},  [x4], 8
+        ${XXTL}    v0.8h, v0.8b
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        ${XXTL}    v1.8h, v1.8b
+        ${XXTL}    v2.8h, v2.8b
+        ${XXTL}    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x13, 128]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x15, 128]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x3, 128]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x4, 128]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x5, 448]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x5, 512]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[7]
+        SMLAL2  v20.4s, v4.8h, v0.h[7]
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v17.4s, v4.4h, v1.h[7]
+        SMLAL2  v21.4s, v4.8h, v1.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v18.4s, v4.4h, v2.h[7]
+        SMLAL2  v22.4s, v4.8h, v2.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v19.4s, v4.4h, v3.h[7]
+        SMLAL2  v23.4s, v4.8h, v3.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    1b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? 1 to 7 bytes of A
+        CBNZ    x0, 3f
+
+2:
+        $if REQUANTIZATION == "GEMMLOWP":
+          # Apply params - scale, shift, bias and clamp
+          LD2R    {v4.4s, v5.4s}, [x11], 8
+          CMEQ    v6.4s, v5.4s, 0
+
+          BIC     v0.16b, v16.16b, v6.16b
+          BIC     v1.16b, v17.16b, v6.16b
+          BIC     v2.16b, v18.16b, v6.16b
+          BIC     v3.16b, v19.16b, v6.16b
+
+          SQRDMULH v16.4s, v16.4s, v4.4s
+          SQRDMULH v17.4s, v17.4s, v4.4s
+          SQRDMULH v18.4s, v18.4s, v4.4s
+          SQRDMULH v19.4s, v19.4s, v4.4s
+
+          SSRA    v16.4s, v0.4s, 31       // signed shift right accumulate
+          SSRA    v17.4s, v1.4s, 31
+          SSRA    v18.4s, v2.4s, 31
+          SSRA    v19.4s, v3.4s, 31
+
+          BIC     v0.16b, v20.16b, v6.16b
+          BIC     v1.16b, v21.16b, v6.16b
+          BIC     v2.16b, v22.16b, v6.16b
+          BIC     v3.16b, v23.16b, v6.16b
+
+          SQRDMULH v20.4s, v20.4s, v4.4s
+          SQRDMULH v21.4s, v21.4s, v4.4s
+          SQRDMULH v22.4s, v22.4s, v4.4s
+          SQRDMULH v23.4s, v23.4s, v4.4s
+
+          SSRA    v20.4s, v0.4s, 31
+          SSRA    v21.4s, v1.4s, 31
+          SSRA    v22.4s, v2.4s, 31
+          SSRA    v23.4s, v3.4s, 31
+
+          BIC     v0.16b, v24.16b, v6.16b
+          BIC     v1.16b, v25.16b, v6.16b
+          BIC     v2.16b, v26.16b, v6.16b
+          BIC     v3.16b, v27.16b, v6.16b
+
+          SQRDMULH v24.4s, v24.4s, v4.4s
+          SQRDMULH v25.4s, v25.4s, v4.4s
+          SQRDMULH v26.4s, v26.4s, v4.4s
+          SQRDMULH v27.4s, v27.4s, v4.4s
+
+          SSRA    v24.4s, v0.4s, 31
+          SSRA    v25.4s, v1.4s, 31
+          SSRA    v26.4s, v2.4s, 31
+          SSRA    v27.4s, v3.4s, 31
+
+          BIC     v0.16b, v28.16b, v6.16b
+          BIC     v1.16b, v29.16b, v6.16b
+          BIC     v2.16b, v30.16b, v6.16b
+          BIC     v3.16b, v31.16b, v6.16b
+
+          SQRDMULH v28.4s, v28.4s, v4.4s
+          SQRDMULH v29.4s, v29.4s, v4.4s
+          SQRDMULH v30.4s, v30.4s, v4.4s
+          SQRDMULH v31.4s, v31.4s, v4.4s
+
+          SSRA    v28.4s, v0.4s, 31
+          SSRA    v29.4s, v1.4s, 31
+          SSRA    v30.4s, v2.4s, 31
+          SSRA    v31.4s, v3.4s, 31
+
+          SRSHL   v16.4s, v16.4s, v5.4s   // signed rounding shift left
+          SRSHL   v17.4s, v17.4s, v5.4s
+          SRSHL   v18.4s, v18.4s, v5.4s
+          SRSHL   v19.4s, v19.4s, v5.4s
+          SRSHL   v20.4s, v20.4s, v5.4s
+          SRSHL   v21.4s, v21.4s, v5.4s
+          SRSHL   v22.4s, v22.4s, v5.4s
+          SRSHL   v23.4s, v23.4s, v5.4s
+          SRSHL   v24.4s, v24.4s, v5.4s
+          SRSHL   v25.4s, v25.4s, v5.4s
+          SRSHL   v26.4s, v26.4s, v5.4s
+          SRSHL   v27.4s, v27.4s, v5.4s
+          SRSHL   v28.4s, v28.4s, v5.4s
+          SRSHL   v29.4s, v29.4s, v5.4s
+          SRSHL   v30.4s, v30.4s, v5.4s
+          SRSHL   v31.4s, v31.4s, v5.4s
+        $elif REQUANTIZATION == "RNDNU":
+          # Apply params - preshift, scale, postshift, bias and clamp
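+          # rndnu: pre-shift (SSHL), scale by a saturating doubling high-half
+          # multiply without rounding (SQDMULH), then a rounding post-shift
+          # (SRSHL) that rounds to nearest with ties up, hence "rndnu".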
+          LD1R    {v4.4s}, [x11], 4
+          SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
+          SSHL    v17.4s, v17.4s, v4.4s
+          SSHL    v18.4s, v18.4s, v4.4s
+          SSHL    v19.4s, v19.4s, v4.4s
+          SSHL    v20.4s, v20.4s, v4.4s
+          SSHL    v21.4s, v21.4s, v4.4s
+          SSHL    v22.4s, v22.4s, v4.4s
+          SSHL    v23.4s, v23.4s, v4.4s
+          LD1R    {v5.4s}, [x11], 4
+          SSHL    v24.4s, v24.4s, v4.4s
+          SSHL    v25.4s, v25.4s, v4.4s
+          SSHL    v26.4s, v26.4s, v4.4s
+          SSHL    v27.4s, v27.4s, v4.4s
+          SSHL    v28.4s, v28.4s, v4.4s
+          SSHL    v29.4s, v29.4s, v4.4s
+          SSHL    v30.4s, v30.4s, v4.4s
+          SSHL    v31.4s, v31.4s, v4.4s
+          LD1R    {v6.4s}, [x11], 4
+          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
+          SQDMULH v17.4s, v17.4s, v5.4s
+          SQDMULH v18.4s, v18.4s, v5.4s
+          SQDMULH v19.4s, v19.4s, v5.4s
+          SQDMULH v20.4s, v20.4s, v5.4s
+          SQDMULH v21.4s, v21.4s, v5.4s
+          SQDMULH v22.4s, v22.4s, v5.4s
+          SQDMULH v23.4s, v23.4s, v5.4s
+          SQDMULH v24.4s, v24.4s, v5.4s
+          SQDMULH v25.4s, v25.4s, v5.4s
+          SQDMULH v26.4s, v26.4s, v5.4s
+          SQDMULH v27.4s, v27.4s, v5.4s
+          SQDMULH v28.4s, v28.4s, v5.4s
+          SQDMULH v29.4s, v29.4s, v5.4s
+          SQDMULH v30.4s, v30.4s, v5.4s
+          SQDMULH v31.4s, v31.4s, v5.4s
+          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
+          SRSHL   v17.4s, v17.4s, v6.4s
+          SRSHL   v18.4s, v18.4s, v6.4s
+          SRSHL   v19.4s, v19.4s, v6.4s
+          SRSHL   v20.4s, v20.4s, v6.4s
+          SRSHL   v21.4s, v21.4s, v6.4s
+          SRSHL   v22.4s, v22.4s, v6.4s
+          SRSHL   v23.4s, v23.4s, v6.4s
+          SRSHL   v24.4s, v24.4s, v6.4s
+          SRSHL   v25.4s, v25.4s, v6.4s
+          SRSHL   v26.4s, v26.4s, v6.4s
+          SRSHL   v27.4s, v27.4s, v6.4s
+          SRSHL   v28.4s, v28.4s, v6.4s
+          SRSHL   v29.4s, v29.4s, v6.4s
+          SRSHL   v30.4s, v30.4s, v6.4s
+          SRSHL   v31.4s, v31.4s, v6.4s
+        $elif REQUANTIZATION == "FP32":
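+          # fp32: convert the accumulators to float, scale (per tensor or per
+          # channel), then FCVTNS rounds back to int32 (nearest, ties to even).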
+          SCVTF   v16.4s, v16.4s
+          SCVTF   v17.4s, v17.4s
+          $if not CHANNELWISE:
+            # Apply params - scale, bias and clamp
+            LD1R    {v4.4s}, [x11], 4
+            SCVTF   v18.4s, v18.4s
+            SCVTF   v19.4s, v19.4s
+          $else:
+            # Load per channel scale values from weights
+            LDR     q4, [x5], 16
+            SCVTF   v18.4s, v18.4s
+            SCVTF   v19.4s, v19.4s
+            LDR     q5, [x5], 16
+          SCVTF   v20.4s, v20.4s
+          SCVTF   v21.4s, v21.4s
+          SCVTF   v22.4s, v22.4s
+          SCVTF   v23.4s, v23.4s
+          SCVTF   v24.4s, v24.4s
+          SCVTF   v25.4s, v25.4s
+          SCVTF   v26.4s, v26.4s
+          SCVTF   v27.4s, v27.4s
+          SCVTF   v28.4s, v28.4s
+          SCVTF   v29.4s, v29.4s
+          SCVTF   v30.4s, v30.4s
+          SCVTF   v31.4s, v31.4s
+
+          $if CHANNELWISE:
+            LDR     q6, [x5], 16
+            FMUL    v16.4s, v16.4s, v4.4s
+            FMUL    v17.4s, v17.4s, v4.4s
+            FMUL    v18.4s, v18.4s, v4.4s
+            FMUL    v19.4s, v19.4s, v4.4s
+            FMUL    v20.4s, v20.4s, v5.4s
+            LDR     q4, [x5], 16
+            FMUL    v21.4s, v21.4s, v5.4s
+            FMUL    v22.4s, v22.4s, v5.4s
+            FMUL    v23.4s, v23.4s, v5.4s
+            FMUL    v24.4s, v24.4s, v6.4s
+            FMUL    v25.4s, v25.4s, v6.4s
+            FMUL    v26.4s, v26.4s, v6.4s
+            FMUL    v27.4s, v27.4s, v6.4s
+            FMUL    v28.4s, v28.4s, v4.4s
+            FMUL    v29.4s, v29.4s, v4.4s
+            FMUL    v30.4s, v30.4s, v4.4s
+            FMUL    v31.4s, v31.4s, v4.4s
+          $else:
+            FMUL    v16.4s, v16.4s, v4.4s
+            FMUL    v17.4s, v17.4s, v4.4s
+            FMUL    v18.4s, v18.4s, v4.4s
+            FMUL    v19.4s, v19.4s, v4.4s
+            FMUL    v20.4s, v20.4s, v4.4s
+            FMUL    v21.4s, v21.4s, v4.4s
+            FMUL    v22.4s, v22.4s, v4.4s
+            FMUL    v23.4s, v23.4s, v4.4s
+            FMUL    v24.4s, v24.4s, v4.4s
+            FMUL    v25.4s, v25.4s, v4.4s
+            FMUL    v26.4s, v26.4s, v4.4s
+            FMUL    v27.4s, v27.4s, v4.4s
+            FMUL    v28.4s, v28.4s, v4.4s
+            FMUL    v29.4s, v29.4s, v4.4s
+            FMUL    v30.4s, v30.4s, v4.4s
+            FMUL    v31.4s, v31.4s, v4.4s
+
+          FCVTNS  v16.4s, v16.4s
+          FCVTNS  v17.4s, v17.4s
+          FCVTNS  v18.4s, v18.4s
+          FCVTNS  v19.4s, v19.4s
+          FCVTNS  v20.4s, v20.4s
+          FCVTNS  v21.4s, v21.4s
+          FCVTNS  v22.4s, v22.4s
+          FCVTNS  v23.4s, v23.4s
+          FCVTNS  v24.4s, v24.4s
+          FCVTNS  v25.4s, v25.4s
+          FCVTNS  v26.4s, v26.4s
+          FCVTNS  v27.4s, v27.4s
+          FCVTNS  v28.4s, v28.4s
+          FCVTNS  v29.4s, v29.4s
+          FCVTNS  v30.4s, v30.4s
+          FCVTNS  v31.4s, v31.4s
+
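+        # Saturating narrow to 16 bits, add the output zero point, narrow to
+        # 8 bits and clamp to the output min/max loaded from params.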
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2       // load output zero point (bias)
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v6.8h
+        SQADD   v17.8h, v17.8h, v6.8h
+        SQADD   v18.8h, v18.8h, v6.8h
+        SQADD   v19.8h, v19.8h, v6.8h
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v4.16b}, [x11], 1      // clamp min value
+
+        ${SQXTXN}   v0.8b, v16.8h
+        ${SQXTXN}   v1.8b, v17.8h
+        ${SQXTXN}   v2.8b, v18.8h
+        ${SQXTXN}   v3.8b, v19.8h
+        LD1R    {v5.16b}, [x11]         // clamp max value
+        ${SQXTXN2}  v0.16b, v24.8h
+        ${SQXTXN2}  v1.16b, v25.8h
+        ${SQXTXN2}  v2.16b, v26.8h
+        ${SQXTXN2}  v3.16b, v27.8h
+        SUB     x11, x11, ${REWIND_DECREMENT}             // rewind params pointer
+
+        ${XMAX}    v0.16b, v0.16b, v4.16b
+        ${XMAX}    v1.16b, v1.16b, v4.16b
+        ${XMAX}    v2.16b, v2.16b, v4.16b
+        ${XMAX}    v3.16b, v3.16b, v4.16b
+        SUBS    x1, x1, 16
+        ${XMIN}    v0.16b, v0.16b, v5.16b
+        ${XMIN}    v1.16b, v1.16b, v5.16b
+        ${XMIN}    v2.16b, v2.16b, v5.16b
+        ${XMIN}    v3.16b, v3.16b, v5.16b
+        B.LO    4f
+
+        # Store full 4 x 16
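+        # Stores are interleaved with rewinding the A pointers by kc so the
+        # same rows are reused for the next 16-column tile while nc remains.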
+        ST1     {v0.16b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        ST1     {v1.16b}, [x8], x12
+        SUB     x15, x15, x2            // a1 -= kc
+        ST1     {v2.16b}, [x9], x12
+        SUB     x13, x13, x2            // a2 -= kc
+        ST1     {v3.16b}, [x7], x12
+        SUB     x4,  x4, x2             // a3 -= kc
+        B.NE    0b
+        RET
+
+        # Remainder - 1 to 7 bytes of A
+        .p2align 3
+3:
+        AND     x0, x2, 7               // kc remainder 1 to 7
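+        # One k is processed per block below; CMP/B.LO and B.EQ branch back
+        # to the requantization code at 2: once all remainder ks are consumed.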
+
+        LD1     {v0.8b},  [x3], x0
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x15], x0
+        LD1     {v2.8b}, [x13], x0
+        LD1     {v3.8b},  [x4], x0
+        ${XXTL}    v0.8h, v0.8b
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        ${XXTL}    v1.8h, v1.8b
+        ${XXTL}    v2.8h, v2.8b
+        ${XXTL}    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       2b
+
+        # Store odd width
+        .p2align 3
+4:
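+        # Test bits 3..0 of the remaining nc in x1 and store 8, 4, 2 and 1
+        # byte in turn, using DUP to shift the still-unstored lanes down.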
+        TBZ     x1, 3, 5f
+        STR     d0, [x6], 8
+        STR     d1, [x8], 8
+        DUP     d0, v0.d[1]
+        DUP     d1, v1.d[1]
+        STR     d2, [x9], 8
+        STR     d3, [x7], 8
+        DUP     d2, v2.d[1]
+        DUP     d3, v3.d[1]
+5:
+        TBZ     x1, 2, 6f
+        STR     s0, [x6], 4
+        STR     s1, [x8], 4
+        DUP     s0, v0.s[1]
+        DUP     s1, v1.s[1]
+        STR     s2, [x9], 4
+        STR     s3, [x7], 4
+        DUP     s2, v2.s[1]
+        DUP     s3, v3.s[1]
+6:
+        TBZ     x1, 1, 7f
+        STR     h0, [x6], 2
+        STR     h1, [x8], 2
+        DUP     h0, v0.h[1]
+        DUP     h1, v1.h[1]
+        STR     h2, [x9], 2
+        STR     h3, [x7], 2
+        DUP     h2, v2.h[1]
+        DUP     h3, v3.h[1]
+7:
+        TBZ     x1, 0, 8f
+        STR     b0, [x6]
+        STR     b1, [x8]
+        STR     b2, [x9]
+        STR     b3, [x7]
+8:
+        RET
+
+END_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..d552539
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,587 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3  x4 v3
+# B   x5 v4  v5  v6
+# C0  x6 v16 v20 v24 v28
+# C1  x8 v17 v21 v25 v29
+# C2  x9 v18 v22 v26 v30
+# C3  x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+# x10 x17 Cortex-A53 temp registers
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+        # Clamp A and C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x12, x11, [sp]          // Load cn_stride, params
+        ADD     x15, x3, x4             // a1 = a0 + a_stride
+        ADD     x8, x6, x7              // c1 = c0 + cm_stride
+        CSEL    x15, x3, x15, LO        //   a1 = a0
+        CSEL    x8, x6,  x8, LO         //   c1 = c0
+
+        ADD     x13, x15, x4            // a2 = a1 + a_stride
+        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
+                                        // if mr <= 2
+        CSEL    x13, x15, x13, LS       //   a2 = a1
+        CSEL    x9,  x8,  x9, LS        //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x4, x13, x4             // a3 = a2 + a_stride
+        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
+        CSEL    x4, x13, x4, LO         //   a3 = a2
+        CSEL    x7,  x9, x7, LO         //   c3 = c2
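+        # Rows beyond mr alias the row above, so out-of-range rows recompute
+        # and overwrite the same output instead of needing a branch per row.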
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        SUBS    x0, x2, 8               // k = kc - 8
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        # Are there at least 8 bytes of A for the main loop?
+        B.LO    3f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+1:
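+        # Each iteration consumes 8 ks: 8 bytes from every A row; per k an
+        # LDP of two B doublewords supplies all 16 columns, widened with SXTL
+        # and accumulated by lane with SMLAL/SMLAL2.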
+        LD1     {v0.8b},  [x3], 8
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x15], 8
+        LD1     {v2.8b}, [x13], 8
+        LD1     {v3.8b},  [x4], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[7]
+        SMLAL2  v20.4s, v4.8h, v0.h[7]
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v17.4s, v4.4h, v1.h[7]
+        SMLAL2  v21.4s, v4.8h, v1.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v18.4s, v4.4h, v2.h[7]
+        SMLAL2  v22.4s, v4.8h, v2.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v19.4s, v4.4h, v3.h[7]
+        SMLAL2  v23.4s, v4.8h, v3.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    1b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? (1 to 7 bytes of A)
+        CBNZ    x0, 3f
+
+2:
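+        # fp32 requantization: SCVTF to float, FMUL by the scale, FCVTNS back
+        # to int32 rounding to nearest (ties to even).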
+        SCVTF   v16.4s, v16.4s
+        SCVTF   v17.4s, v17.4s
+        # Apply params - scale, bias and clamp
+        LD1R    {v4.4s}, [x11], 4
+        SCVTF   v18.4s, v18.4s
+        SCVTF   v19.4s, v19.4s
+        SCVTF   v20.4s, v20.4s
+        SCVTF   v21.4s, v21.4s
+        SCVTF   v22.4s, v22.4s
+        SCVTF   v23.4s, v23.4s
+        SCVTF   v24.4s, v24.4s
+        SCVTF   v25.4s, v25.4s
+        SCVTF   v26.4s, v26.4s
+        SCVTF   v27.4s, v27.4s
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        FMUL    v16.4s, v16.4s, v4.4s
+        FMUL    v17.4s, v17.4s, v4.4s
+        FMUL    v18.4s, v18.4s, v4.4s
+        FMUL    v19.4s, v19.4s, v4.4s
+        FMUL    v20.4s, v20.4s, v4.4s
+        FMUL    v21.4s, v21.4s, v4.4s
+        FMUL    v22.4s, v22.4s, v4.4s
+        FMUL    v23.4s, v23.4s, v4.4s
+        FMUL    v24.4s, v24.4s, v4.4s
+        FMUL    v25.4s, v25.4s, v4.4s
+        FMUL    v26.4s, v26.4s, v4.4s
+        FMUL    v27.4s, v27.4s, v4.4s
+        FMUL    v28.4s, v28.4s, v4.4s
+        FMUL    v29.4s, v29.4s, v4.4s
+        FMUL    v30.4s, v30.4s, v4.4s
+        FMUL    v31.4s, v31.4s, v4.4s
+
+        FCVTNS  v16.4s, v16.4s
+        FCVTNS  v17.4s, v17.4s
+        FCVTNS  v18.4s, v18.4s
+        FCVTNS  v19.4s, v19.4s
+        FCVTNS  v20.4s, v20.4s
+        FCVTNS  v21.4s, v21.4s
+        FCVTNS  v22.4s, v22.4s
+        FCVTNS  v23.4s, v23.4s
+        FCVTNS  v24.4s, v24.4s
+        FCVTNS  v25.4s, v25.4s
+        FCVTNS  v26.4s, v26.4s
+        FCVTNS  v27.4s, v27.4s
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2       // load output zero point (bias)
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v6.8h
+        SQADD   v17.8h, v17.8h, v6.8h
+        SQADD   v18.8h, v18.8h, v6.8h
+        SQADD   v19.8h, v19.8h, v6.8h
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v4.16b}, [x11], 1      // clamp min value
+
+        SQXTN   v0.8b, v16.8h
+        SQXTN   v1.8b, v17.8h
+        SQXTN   v2.8b, v18.8h
+        SQXTN   v3.8b, v19.8h
+        LD1R    {v5.16b}, [x11]         // clamp max value
+        SQXTN2  v0.16b, v24.8h
+        SQXTN2  v1.16b, v25.8h
+        SQXTN2  v2.16b, v26.8h
+        SQXTN2  v3.16b, v27.8h
+        SUB     x11, x11, 7             // rewind params pointer
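+        # The params loads consumed 7 bytes: scale (4), output zero point (2)
+        # and min (1); max was loaded without post-increment.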
+
+        SMAX    v0.16b, v0.16b, v4.16b
+        SMAX    v1.16b, v1.16b, v4.16b
+        SMAX    v2.16b, v2.16b, v4.16b
+        SMAX    v3.16b, v3.16b, v4.16b
+        SUBS    x1, x1, 16
+        SMIN    v0.16b, v0.16b, v5.16b
+        SMIN    v1.16b, v1.16b, v5.16b
+        SMIN    v2.16b, v2.16b, v5.16b
+        SMIN    v3.16b, v3.16b, v5.16b
+        B.LO    4f
+
+        # Store full 4 x 16
+        ST1     {v0.16b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        ST1     {v1.16b}, [x8], x12
+        SUB     x15, x15, x2            // a1 -= kc
+        ST1     {v2.16b}, [x9], x12
+        SUB     x13, x13, x2            // a2 -= kc
+        ST1     {v3.16b}, [x7], x12
+        SUB     x4,  x4, x2             // a3 -= kc
+        B.NE    0b
+        RET
+
+        # Remainder - 1 to 7 bytes of A
+        .p2align 3
+3:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b},  [x3], x0
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x15], x0
+        LD1     {v2.8b}, [x13], x0
+        LD1     {v3.8b},  [x4], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       2b
+
+        # Store odd width
+        .p2align 3
+4:
+        TBZ     x1, 3, 5f
+        STR     d0, [x6], 8
+        STR     d1, [x8], 8
+        DUP     d0, v0.d[1]
+        DUP     d1, v1.d[1]
+        STR     d2, [x9], 8
+        STR     d3, [x7], 8
+        DUP     d2, v2.d[1]
+        DUP     d3, v3.d[1]
+5:
+        TBZ     x1, 2, 6f
+        STR     s0, [x6], 4
+        STR     s1, [x8], 4
+        DUP     s0, v0.s[1]
+        DUP     s1, v1.s[1]
+        STR     s2, [x9], 4
+        STR     s3, [x7], 4
+        DUP     s2, v2.s[1]
+        DUP     s3, v3.s[1]
+6:
+        TBZ     x1, 1, 7f
+        STR     h0, [x6], 2
+        STR     h1, [x8], 2
+        DUP     h0, v0.h[1]
+        DUP     h1, v1.h[1]
+        STR     h2, [x9], 2
+        STR     h3, [x7], 2
+        DUP     h2, v2.h[1]
+        DUP     h3, v3.h[1]
+7:
+        TBZ     x1, 0, 8f
+        STR     b0, [x6]
+        STR     b1, [x8]
+        STR     b2, [x9]
+        STR     b3, [x7]
+8:
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..6f95707
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,593 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3  x4 v3
+# B   x5 v4  v5  v6
+# C0  x6 v16 v20 v24 v28
+# C1  x8 v17 v21 v25 v29
+# C2  x9 v18 v22 v26 v30
+# C3  x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+# x10 x17 Cortex-A53 temp registers
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+        # Clamp A and C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x12, x11, [sp]          // Load cn_stride, params
+        ADD     x15, x3, x4             // a1 = a0 + a_stride
+        ADD     x8, x6, x7              // c1 = c0 + cm_stride
+        CSEL    x15, x3, x15, LO        //   a1 = a0
+        CSEL    x8, x6,  x8, LO         //   c1 = c0
+
+        ADD     x13, x15, x4            // a2 = a1 + a_stride
+        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
+                                        // if mr <= 2
+        CSEL    x13, x15, x13, LS       //   a2 = a1
+        CSEL    x9,  x8,  x9, LS        //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x4, x13, x4             // a3 = a2 + a_stride
+        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
+        CSEL    x4, x13, x4, LO         //   a3 = a2
+        CSEL    x7,  x9, x7, LO         //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        SUBS    x0, x2, 8               // k = kc - 8
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        # Are there at least 8 bytes of A for the main loop?
+        B.LO    3f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+1:
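+        # Identical to the non-prfm kernel apart from PLDL1KEEP prefetches:
+        # A rows 128 bytes ahead, weights 448 and 512 bytes ahead of x5.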
+        LD1     {v0.8b},  [x3], 8
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x15], 8
+        LD1     {v2.8b}, [x13], 8
+        LD1     {v3.8b},  [x4], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        PRFM    PLDL1KEEP, [x13, 128]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        PRFM    PLDL1KEEP, [x15, 128]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        PRFM    PLDL1KEEP, [x3, 128]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        PRFM    PLDL1KEEP, [x4, 128]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        PRFM    PLDL1KEEP, [x5, 448]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        PRFM    PLDL1KEEP, [x5, 512]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[7]
+        SMLAL2  v20.4s, v4.8h, v0.h[7]
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v17.4s, v4.4h, v1.h[7]
+        SMLAL2  v21.4s, v4.8h, v1.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v18.4s, v4.4h, v2.h[7]
+        SMLAL2  v22.4s, v4.8h, v2.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v19.4s, v4.4h, v3.h[7]
+        SMLAL2  v23.4s, v4.8h, v3.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    1b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? (1 to 7 bytes of A)
+        CBNZ    x0, 3f
+
+2:
+        SCVTF   v16.4s, v16.4s
+        SCVTF   v17.4s, v17.4s
+        # Apply params - scale, bias and clamp
+        LD1R    {v4.4s}, [x11], 4
+        SCVTF   v18.4s, v18.4s
+        SCVTF   v19.4s, v19.4s
+        SCVTF   v20.4s, v20.4s
+        SCVTF   v21.4s, v21.4s
+        SCVTF   v22.4s, v22.4s
+        SCVTF   v23.4s, v23.4s
+        SCVTF   v24.4s, v24.4s
+        SCVTF   v25.4s, v25.4s
+        SCVTF   v26.4s, v26.4s
+        SCVTF   v27.4s, v27.4s
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        FMUL    v16.4s, v16.4s, v4.4s
+        FMUL    v17.4s, v17.4s, v4.4s
+        FMUL    v18.4s, v18.4s, v4.4s
+        FMUL    v19.4s, v19.4s, v4.4s
+        FMUL    v20.4s, v20.4s, v4.4s
+        FMUL    v21.4s, v21.4s, v4.4s
+        FMUL    v22.4s, v22.4s, v4.4s
+        FMUL    v23.4s, v23.4s, v4.4s
+        FMUL    v24.4s, v24.4s, v4.4s
+        FMUL    v25.4s, v25.4s, v4.4s
+        FMUL    v26.4s, v26.4s, v4.4s
+        FMUL    v27.4s, v27.4s, v4.4s
+        FMUL    v28.4s, v28.4s, v4.4s
+        FMUL    v29.4s, v29.4s, v4.4s
+        FMUL    v30.4s, v30.4s, v4.4s
+        FMUL    v31.4s, v31.4s, v4.4s
+
+        FCVTNS  v16.4s, v16.4s
+        FCVTNS  v17.4s, v17.4s
+        FCVTNS  v18.4s, v18.4s
+        FCVTNS  v19.4s, v19.4s
+        FCVTNS  v20.4s, v20.4s
+        FCVTNS  v21.4s, v21.4s
+        FCVTNS  v22.4s, v22.4s
+        FCVTNS  v23.4s, v23.4s
+        FCVTNS  v24.4s, v24.4s
+        FCVTNS  v25.4s, v25.4s
+        FCVTNS  v26.4s, v26.4s
+        FCVTNS  v27.4s, v27.4s
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2       // load output zero point (bias)
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v6.8h
+        SQADD   v17.8h, v17.8h, v6.8h
+        SQADD   v18.8h, v18.8h, v6.8h
+        SQADD   v19.8h, v19.8h, v6.8h
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v4.16b}, [x11], 1      // clamp min value
+
+        SQXTN   v0.8b, v16.8h
+        SQXTN   v1.8b, v17.8h
+        SQXTN   v2.8b, v18.8h
+        SQXTN   v3.8b, v19.8h
+        LD1R    {v5.16b}, [x11]         // clamp max value
+        SQXTN2  v0.16b, v24.8h
+        SQXTN2  v1.16b, v25.8h
+        SQXTN2  v2.16b, v26.8h
+        SQXTN2  v3.16b, v27.8h
+        SUB     x11, x11, 7             // rewind params pointer
+
+        SMAX    v0.16b, v0.16b, v4.16b
+        SMAX    v1.16b, v1.16b, v4.16b
+        SMAX    v2.16b, v2.16b, v4.16b
+        SMAX    v3.16b, v3.16b, v4.16b
+        SUBS    x1, x1, 16
+        SMIN    v0.16b, v0.16b, v5.16b
+        SMIN    v1.16b, v1.16b, v5.16b
+        SMIN    v2.16b, v2.16b, v5.16b
+        SMIN    v3.16b, v3.16b, v5.16b
+        B.LO    4f
+
+        # Store full 4 x 16
+        ST1     {v0.16b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        ST1     {v1.16b}, [x8], x12
+        SUB     x15, x15, x2            // a1 -= kc
+        ST1     {v2.16b}, [x9], x12
+        SUB     x13, x13, x2            // a2 -= kc
+        ST1     {v3.16b}, [x7], x12
+        SUB     x4,  x4, x2             // a3 -= kc
+        B.NE    0b
+        RET
+
+        # Remainder - 1 to 7 bytes of A
+        .p2align 3
+3:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b},  [x3], x0
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x15], x0
+        LD1     {v2.8b}, [x13], x0
+        LD1     {v3.8b},  [x4], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       2b
+
+        # Store odd width
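+        # x1 = nc - 16 at this point, so its low 4 bits still equal the
+        # remaining width; test bits 3..0 to store 8, 4, 2, then 1 byte per
+        # row, DUPing the remaining lanes down to lane 0 after each store.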
+        .p2align 3
+4:
+        TBZ     x1, 3, 5f
+        STR     d0, [x6], 8
+        STR     d1, [x8], 8
+        DUP     d0, v0.d[1]
+        DUP     d1, v1.d[1]
+        STR     d2, [x9], 8
+        STR     d3, [x7], 8
+        DUP     d2, v2.d[1]
+        DUP     d3, v3.d[1]
+5:
+        TBZ     x1, 2, 6f
+        STR     s0, [x6], 4
+        STR     s1, [x8], 4
+        DUP     s0, v0.s[1]
+        DUP     s1, v1.s[1]
+        STR     s2, [x9], 4
+        STR     s3, [x7], 4
+        DUP     s2, v2.s[1]
+        DUP     s3, v3.s[1]
+6:
+        TBZ     x1, 1, 7f
+        STR     h0, [x6], 2
+        STR     h1, [x8], 2
+        DUP     h0, v0.h[1]
+        DUP     h1, v1.h[1]
+        STR     h2, [x9], 2
+        STR     h3, [x7], 2
+        DUP     h2, v2.h[1]
+        DUP     h3, v3.h[1]
+7:
+        TBZ     x1, 0, 8f
+        STR     b0, [x6]
+        STR     b1, [x8]
+        STR     b2, [x9]
+        STR     b3, [x7]
+8:
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..ebd7491
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,587 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3  x4 v3
+# B   x5 v4  v5  v6
+# C0  x6 v16 v20 v24 v28
+# C1  x8 v17 v21 v25 v29
+# C2  x9 v18 v22 v26 v30
+# C3  x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+# x10 x17 Cortex-A53 temp registers
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+        # Clamp A and C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x12, x11, [sp]          // Load cn_stride, params
+        ADD     x15, x3, x4             // a1 = a0 + a_stride
+        ADD     x8, x6, x7              // c1 = c0 + cm_stride
+        CSEL    x15, x3, x15, LO        //   a1 = a0
+        CSEL    x8, x6,  x8, LO         //   c1 = c0
+
+        ADD     x13, x15, x4            // a2 = a1 + a_stride
+        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
+                                        // if mr <= 2
+        CSEL    x13, x15, x13, LS       //   a2 = a1
+        CSEL    x9,  x8,  x9, LS        //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x4, x13, x4             // a3 = a2 + a_stride
+        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
+        CSEL    x4, x13, x4, LO         //   a3 = a2
+        CSEL    x7,  x9, x7, LO         //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
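+        # v16/v20/v24/v28 hold the 16 int32 biases for columns 0-15 of row 0;
+        # the MOVs below replicate them into the accumulators for rows 1-3.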
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        SUBS    x0, x2, 8               // k = kc - 8
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        # Are there at least 8 bytes of A for the main loop?
+        B.LO    3f
+
+        # Main loop - 8 bytes of A
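+        # Each iteration consumes 8 bytes from every A row and 8x16 int8
+        # weights from B, widening both to 16 bits and accumulating into the
+        # int32 accumulators with lane-indexed SMLAL/SMLAL2.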
+        .p2align 3
+1:
+        LD1     {v0.8b},  [x3], 8
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x15], 8
+        LD1     {v2.8b}, [x13], 8
+        LD1     {v3.8b},  [x4], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[7]
+        SMLAL2  v20.4s, v4.8h, v0.h[7]
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v17.4s, v4.4h, v1.h[7]
+        SMLAL2  v21.4s, v4.8h, v1.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v18.4s, v4.4h, v2.h[7]
+        SMLAL2  v22.4s, v4.8h, v2.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v19.4s, v4.4h, v3.h[7]
+        SMLAL2  v23.4s, v4.8h, v3.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    1b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? (1 to 7 bytes of A)
+        CBNZ    x0, 3f
+
+2:
+        # Apply params - preshift, scale, postshift, bias and clamp
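+        # The rndnu params are consumed in stream order by the LD1R loads:
+        #   right_pre_shift -> v4.4s, multiplier -> v5.4s,
+        #   right_post_shift -> v6.4s, output_zero_point -> v6.8h,
+        #   output_min -> v4.16b, output_max -> v5.16b
+        # (15 bytes total, rewound below for the next nc tile).
+        # A scalar sketch of the steps, assuming only the ARM shift semantics
+        # (negative SSHL/SRSHL amounts shift right):
+        #   acc = sshl(acc, right_pre_shift);                       // SSHL
+        #   acc = sat32((2 * (int64_t) acc * multiplier) >> 32);    // SQDMULH
+        #   acc = srshl(acc, right_post_shift);                     // SRSHL
+        #   out = clamp(sat8(sat16(acc) + output_zero_point),
+        #               output_min, output_max);      // SQXTN/SQADD/SMAX/SMIN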
+        LD1R    {v4.4s}, [x11], 4
+        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
+        SSHL    v17.4s, v17.4s, v4.4s
+        SSHL    v18.4s, v18.4s, v4.4s
+        SSHL    v19.4s, v19.4s, v4.4s
+        SSHL    v20.4s, v20.4s, v4.4s
+        SSHL    v21.4s, v21.4s, v4.4s
+        SSHL    v22.4s, v22.4s, v4.4s
+        SSHL    v23.4s, v23.4s, v4.4s
+        LD1R    {v5.4s}, [x11], 4
+        SSHL    v24.4s, v24.4s, v4.4s
+        SSHL    v25.4s, v25.4s, v4.4s
+        SSHL    v26.4s, v26.4s, v4.4s
+        SSHL    v27.4s, v27.4s, v4.4s
+        SSHL    v28.4s, v28.4s, v4.4s
+        SSHL    v29.4s, v29.4s, v4.4s
+        SSHL    v30.4s, v30.4s, v4.4s
+        SSHL    v31.4s, v31.4s, v4.4s
+        LD1R    {v6.4s}, [x11], 4
+        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
+        SQDMULH v17.4s, v17.4s, v5.4s
+        SQDMULH v18.4s, v18.4s, v5.4s
+        SQDMULH v19.4s, v19.4s, v5.4s
+        SQDMULH v20.4s, v20.4s, v5.4s
+        SQDMULH v21.4s, v21.4s, v5.4s
+        SQDMULH v22.4s, v22.4s, v5.4s
+        SQDMULH v23.4s, v23.4s, v5.4s
+        SQDMULH v24.4s, v24.4s, v5.4s
+        SQDMULH v25.4s, v25.4s, v5.4s
+        SQDMULH v26.4s, v26.4s, v5.4s
+        SQDMULH v27.4s, v27.4s, v5.4s
+        SQDMULH v28.4s, v28.4s, v5.4s
+        SQDMULH v29.4s, v29.4s, v5.4s
+        SQDMULH v30.4s, v30.4s, v5.4s
+        SQDMULH v31.4s, v31.4s, v5.4s
+        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
+        SRSHL   v17.4s, v17.4s, v6.4s
+        SRSHL   v18.4s, v18.4s, v6.4s
+        SRSHL   v19.4s, v19.4s, v6.4s
+        SRSHL   v20.4s, v20.4s, v6.4s
+        SRSHL   v21.4s, v21.4s, v6.4s
+        SRSHL   v22.4s, v22.4s, v6.4s
+        SRSHL   v23.4s, v23.4s, v6.4s
+        SRSHL   v24.4s, v24.4s, v6.4s
+        SRSHL   v25.4s, v25.4s, v6.4s
+        SRSHL   v26.4s, v26.4s, v6.4s
+        SRSHL   v27.4s, v27.4s, v6.4s
+        SRSHL   v28.4s, v28.4s, v6.4s
+        SRSHL   v29.4s, v29.4s, v6.4s
+        SRSHL   v30.4s, v30.4s, v6.4s
+        SRSHL   v31.4s, v31.4s, v6.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2       // add bias
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v6.8h
+        SQADD   v17.8h, v17.8h, v6.8h
+        SQADD   v18.8h, v18.8h, v6.8h
+        SQADD   v19.8h, v19.8h, v6.8h
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v4.16b}, [x11], 1      // clamp min value
+
+        SQXTN   v0.8b, v16.8h
+        SQXTN   v1.8b, v17.8h
+        SQXTN   v2.8b, v18.8h
+        SQXTN   v3.8b, v19.8h
+        LD1R    {v5.16b}, [x11]         // clamp max value
+        SQXTN2  v0.16b, v24.8h
+        SQXTN2  v1.16b, v25.8h
+        SQXTN2  v2.16b, v26.8h
+        SQXTN2  v3.16b, v27.8h
+        SUB     x11, x11, 15             // rewind params pointer
+
+        SMAX    v0.16b, v0.16b, v4.16b
+        SMAX    v1.16b, v1.16b, v4.16b
+        SMAX    v2.16b, v2.16b, v4.16b
+        SMAX    v3.16b, v3.16b, v4.16b
+        SUBS    x1, x1, 16
+        SMIN    v0.16b, v0.16b, v5.16b
+        SMIN    v1.16b, v1.16b, v5.16b
+        SMIN    v2.16b, v2.16b, v5.16b
+        SMIN    v3.16b, v3.16b, v5.16b
+        B.LO    4f
+
+        # Store full 4 x 16
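+        # Interleave the row stores with rewinding each A pointer by kc so
+        # the next nc tile re-reads the same rows of A.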
+        ST1     {v0.16b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        ST1     {v1.16b}, [x8], x12
+        SUB     x15, x15, x2            // a1 -= kc
+        ST1     {v2.16b}, [x9], x12
+        SUB     x13, x13, x2            // a2 -= kc
+        ST1     {v3.16b}, [x7], x12
+        SUB     x4,  x4, x2             // a3 -= kc
+        B.NE    0b
+        RET
+
+        # Remainder - 1 to 7 bytes of A
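+        # Process one A byte (one lane of v0-v3) per 16 bytes of B; after
+        # each lane, compare against the remainder count and branch back to
+        # the requantization at 2b once all remainder bytes are consumed.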
+        .p2align 3
+3:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b},  [x3], x0
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x15], x0
+        LD1     {v2.8b}, [x13], x0
+        LD1     {v3.8b},  [x4], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       2b
+
+        # Store odd width
+        .p2align 3
+4:
+        TBZ     x1, 3, 5f
+        STR     d0, [x6], 8
+        STR     d1, [x8], 8
+        DUP     d0, v0.d[1]
+        DUP     d1, v1.d[1]
+        STR     d2, [x9], 8
+        STR     d3, [x7], 8
+        DUP     d2, v2.d[1]
+        DUP     d3, v3.d[1]
+5:
+        TBZ     x1, 2, 6f
+        STR     s0, [x6], 4
+        STR     s1, [x8], 4
+        DUP     s0, v0.s[1]
+        DUP     s1, v1.s[1]
+        STR     s2, [x9], 4
+        STR     s3, [x7], 4
+        DUP     s2, v2.s[1]
+        DUP     s3, v3.s[1]
+6:
+        TBZ     x1, 1, 7f
+        STR     h0, [x6], 2
+        STR     h1, [x8], 2
+        DUP     h0, v0.h[1]
+        DUP     h1, v1.h[1]
+        STR     h2, [x9], 2
+        STR     h3, [x7], 2
+        DUP     h2, v2.h[1]
+        DUP     h3, v3.h[1]
+7:
+        TBZ     x1, 0, 8f
+        STR     b0, [x6]
+        STR     b1, [x8]
+        STR     b2, [x9]
+        STR     b3, [x7]
+8:
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..0836cbd
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,593 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     const int8_t* restrict a,  x3
+#     size_t a_stride,           x4
+#     const void* restrict w,    x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,          [sp] -> x12
+#     const union xnn_qs8_conv_minmax_params params)  [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3  x4 v3
+# B   x5 v4  v5  v6
+# C0  x6 v16 v20 v24 v28
+# C1  x8 v17 v21 v25 v29
+# C2  x9 v18 v22 v26 v30
+# C3  x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+# x10 x17 Cortex-A53 temp registers
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+        # Clamp A and C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x12, x11, [sp]          // Load cn_stride, params
+        ADD     x15, x3, x4             // a1 = a0 + a_stride
+        ADD     x8, x6, x7              // c1 = c0 + cm_stride
+        CSEL    x15, x3, x15, LO        //   a1 = a0
+        CSEL    x8, x6,  x8, LO         //   c1 = c0
+
+        ADD     x13, x15, x4            // a2 = a1 + a_stride
+        ADD     x9,  x8, x7             // c2 = c1 + cm_stride
+                                        // if mr <= 2
+        CSEL    x13, x15, x13, LS       //   a2 = a1
+        CSEL    x9,  x8,  x9, LS        //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        ADD     x4, x13, x4             // a3 = a2 + a_stride
+        ADD     x7,  x9, x7             // c3 = c2 + cm_stride
+        CSEL    x4, x13, x4, LO         //   a3 = a2
+        CSEL    x7,  x9, x7, LO         //   c3 = c2
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        SUBS    x0, x2, 8               // k = kc - 8
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        # Are there at least 8 bytes of A for the main loop?
+        B.LO    3f
+
+        # Main loop - 8 bytes of A
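+        # PRFM variant: prefetch each A row 128 bytes ahead and the packed
+        # weights 448 and 512 bytes ahead into L1 while the MACs execute.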
+        .p2align 3
+1:
+        LD1     {v0.8b},  [x3], 8
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x15], 8
+        LD1     {v2.8b}, [x13], 8
+        LD1     {v3.8b},  [x4], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        PRFM    PLDL1KEEP, [x13, 128]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        PRFM    PLDL1KEEP, [x15, 128]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        PRFM    PLDL1KEEP, [x3, 128]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        PRFM    PLDL1KEEP, [x4, 128]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        PRFM    PLDL1KEEP, [x5, 448]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        PRFM    PLDL1KEEP, [x5, 512]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[7]
+        SMLAL2  v20.4s, v4.8h, v0.h[7]
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v17.4s, v4.4h, v1.h[7]
+        SMLAL2  v21.4s, v4.8h, v1.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v18.4s, v4.4h, v2.h[7]
+        SMLAL2  v22.4s, v4.8h, v2.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v19.4s, v4.4h, v3.h[7]
+        SMLAL2  v23.4s, v4.8h, v3.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    1b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? (1 to 7 bytes of A)
+        CBNZ    x0, 3f
+
+2:
+        # Apply params - preshift, scale, postshift, bias and clamp
+        LD1R    {v4.4s}, [x11], 4
+        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
+        SSHL    v17.4s, v17.4s, v4.4s
+        SSHL    v18.4s, v18.4s, v4.4s
+        SSHL    v19.4s, v19.4s, v4.4s
+        SSHL    v20.4s, v20.4s, v4.4s
+        SSHL    v21.4s, v21.4s, v4.4s
+        SSHL    v22.4s, v22.4s, v4.4s
+        SSHL    v23.4s, v23.4s, v4.4s
+        LD1R    {v5.4s}, [x11], 4
+        SSHL    v24.4s, v24.4s, v4.4s
+        SSHL    v25.4s, v25.4s, v4.4s
+        SSHL    v26.4s, v26.4s, v4.4s
+        SSHL    v27.4s, v27.4s, v4.4s
+        SSHL    v28.4s, v28.4s, v4.4s
+        SSHL    v29.4s, v29.4s, v4.4s
+        SSHL    v30.4s, v30.4s, v4.4s
+        SSHL    v31.4s, v31.4s, v4.4s
+        LD1R    {v6.4s}, [x11], 4
+        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
+        SQDMULH v17.4s, v17.4s, v5.4s
+        SQDMULH v18.4s, v18.4s, v5.4s
+        SQDMULH v19.4s, v19.4s, v5.4s
+        SQDMULH v20.4s, v20.4s, v5.4s
+        SQDMULH v21.4s, v21.4s, v5.4s
+        SQDMULH v22.4s, v22.4s, v5.4s
+        SQDMULH v23.4s, v23.4s, v5.4s
+        SQDMULH v24.4s, v24.4s, v5.4s
+        SQDMULH v25.4s, v25.4s, v5.4s
+        SQDMULH v26.4s, v26.4s, v5.4s
+        SQDMULH v27.4s, v27.4s, v5.4s
+        SQDMULH v28.4s, v28.4s, v5.4s
+        SQDMULH v29.4s, v29.4s, v5.4s
+        SQDMULH v30.4s, v30.4s, v5.4s
+        SQDMULH v31.4s, v31.4s, v5.4s
+        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
+        SRSHL   v17.4s, v17.4s, v6.4s
+        SRSHL   v18.4s, v18.4s, v6.4s
+        SRSHL   v19.4s, v19.4s, v6.4s
+        SRSHL   v20.4s, v20.4s, v6.4s
+        SRSHL   v21.4s, v21.4s, v6.4s
+        SRSHL   v22.4s, v22.4s, v6.4s
+        SRSHL   v23.4s, v23.4s, v6.4s
+        SRSHL   v24.4s, v24.4s, v6.4s
+        SRSHL   v25.4s, v25.4s, v6.4s
+        SRSHL   v26.4s, v26.4s, v6.4s
+        SRSHL   v27.4s, v27.4s, v6.4s
+        SRSHL   v28.4s, v28.4s, v6.4s
+        SRSHL   v29.4s, v29.4s, v6.4s
+        SRSHL   v30.4s, v30.4s, v6.4s
+        SRSHL   v31.4s, v31.4s, v6.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2       // add bias
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v6.8h
+        SQADD   v17.8h, v17.8h, v6.8h
+        SQADD   v18.8h, v18.8h, v6.8h
+        SQADD   v19.8h, v19.8h, v6.8h
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v4.16b}, [x11], 1      // clamp min value
+
+        SQXTN   v0.8b, v16.8h
+        SQXTN   v1.8b, v17.8h
+        SQXTN   v2.8b, v18.8h
+        SQXTN   v3.8b, v19.8h
+        LD1R    {v5.16b}, [x11]         // clamp max value
+        SQXTN2  v0.16b, v24.8h
+        SQXTN2  v1.16b, v25.8h
+        SQXTN2  v2.16b, v26.8h
+        SQXTN2  v3.16b, v27.8h
+        SUB     x11, x11, 15             // rewind params pointer
+
+        SMAX    v0.16b, v0.16b, v4.16b
+        SMAX    v1.16b, v1.16b, v4.16b
+        SMAX    v2.16b, v2.16b, v4.16b
+        SMAX    v3.16b, v3.16b, v4.16b
+        SUBS    x1, x1, 16
+        SMIN    v0.16b, v0.16b, v5.16b
+        SMIN    v1.16b, v1.16b, v5.16b
+        SMIN    v2.16b, v2.16b, v5.16b
+        SMIN    v3.16b, v3.16b, v5.16b
+        B.LO    4f
+
+        # Store full 4 x 16
+        ST1     {v0.16b}, [x6], x12
+        SUB     x3,  x3, x2             // a0 -= kc
+        ST1     {v1.16b}, [x8], x12
+        SUB     x15, x15, x2            // a1 -= kc
+        ST1     {v2.16b}, [x9], x12
+        SUB     x13, x13, x2            // a2 -= kc
+        ST1     {v3.16b}, [x7], x12
+        SUB     x4,  x4, x2             // a3 -= kc
+        B.NE    0b
+        RET
+
+        # Remainder - 1 to 7 bytes of A
+        .p2align 3
+3:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b},  [x3], x0
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x15], x0
+        LD1     {v2.8b}, [x13], x0
+        LD1     {v3.8b},  [x4], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    2b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       2b
+
+        # Store odd width
+        .p2align 3
+4:
+        TBZ     x1, 3, 5f
+        STR     d0, [x6], 8
+        STR     d1, [x8], 8
+        DUP     d0, v0.d[1]
+        DUP     d1, v1.d[1]
+        STR     d2, [x9], 8
+        STR     d3, [x7], 8
+        DUP     d2, v2.d[1]
+        DUP     d3, v3.d[1]
+5:
+        TBZ     x1, 2, 6f
+        STR     s0, [x6], 4
+        STR     s1, [x8], 4
+        DUP     s0, v0.s[1]
+        DUP     s1, v1.s[1]
+        STR     s2, [x9], 4
+        STR     s3, [x7], 4
+        DUP     s2, v2.s[1]
+        DUP     s3, v3.s[1]
+6:
+        TBZ     x1, 1, 7f
+        STR     h0, [x6], 2
+        STR     h1, [x8], 2
+        DUP     h0, v0.h[1]
+        DUP     h1, v1.h[1]
+        STR     h2, [x9], 2
+        STR     h3, [x7], 2
+        DUP     h2, v2.h[1]
+        DUP     h3, v3.h[1]
+7:
+        TBZ     x1, 0, 8f
+        STR     b0, [x6]
+        STR     b1, [x8]
+        STR     b2, [x9]
+        STR     b3, [x7]
+8:
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in b/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
new file mode 100644
index 0000000..9730957
--- /dev/null
+++ b/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
@@ -0,0 +1,911 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert REQUANTIZATION in ["FP32", "GEMMLOWP", "RNDNU"]
+$assert not CHANNELWISE or REQUANTIZATION == "FP32"
+$assert DATATYPE in ["QC8", "QS8", "QU8"]
+$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
+$assert DATATYPE != "QU8" or REQUANTIZATION == "RNDNU"
+
+#include <xnnpack/assembly.h>
+
+$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
+$if DATATYPE == "QU8":
+  $REWIND_DECREMENT = 19
+$else:
+  $REWIND_DECREMENT = 3 if CHANNELWISE else {"GEMMLOWP": 11, "RNDNU": 15, "FP32": 7}[REQUANTIZATION]
+$XMIN = "UMIN" if DATATYPE == "QU8" else "SMIN"
+$XMAX = "UMAX" if DATATYPE == "QU8" else "SMAX"
+$XXTL = "UXTL" if DATATYPE == "QU8" else "SXTL"
+$SQXTXN = "SQXTUN" if DATATYPE == "QU8" else "SQXTN"
+$SQXTXN2 = "SQXTUN2" if DATATYPE == "QU8" else "SQXTN2"
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
+# void xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const ${XINT8_T}** restrict a, x4
+#     const ${XINT8_T}* restrict w,  x5
+#     ${XINT8_T}* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x8
+#     const ${XINT8_T}* zero,                [sp + 16] -> x12
+#     const union ${PARAMS_UNION} params)  [sp + 24] -> x11
+
+$if REQUANTIZATION == "RNDNU" and DATATYPE == "QU8":
+  # params structure is 20 bytes
+  #  struct {
+  #    ${XINT8_T} kernel_zero_point[4];
+  #    int32_t right_pre_shift;
+  #    int32_t multiplier;
+  #    int32_t right_post_shift;
+  #    int16_t output_zero_point;
+  #    ${XINT8_T} output_min;
+  #    ${XINT8_T} output_max;
+  #  } rndnu_neon;
+  #
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x13  v0
+# A1  x14  v1
+# A2  x15  v2
+# A3  x20  v3
+# B    x5  v4  v5  v6
+# C0   x6 v16 v20 v24 v28
+# C1  x16 v17 v21 v25 v29
+# C2  x17 v18 v22 v26 v30
+# C3   x7 v19 v23 v27 v31
+$if DATATYPE == "QU8":
+  # zero_point v7
+  # unused  v8 v9 v10 v11 v12 v13 v14 v15
+$else:
+  # unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
+# x11, x21 temp for Cortex-A53 loads
+
+BEGIN_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+        # Clamp C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
+        ADD     x16, x6, x7             // c1 = c0 + cm_stride
+        CSEL    x16, x6,  x16, LO       //   c1 = c0
+
+        ADD     x17, x16, x7            // c2 = c1 + cm_stride
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
+                                        // if mr <= 2
+        CSEL    x17, x16, x17, LS       //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        CSEL    x7,  x17, x7, LO        //   c3 = c2
+
+        $if DATATYPE == "QU8":
+          LD1R    {v7.4s}, [x11]          // kernel_zero_point
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        $if DATATYPE == "QU8":
+          ADD     x11, x11, 4              // skip kernel_zero_point in params
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
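+        # An A pointer equal to `zero` selects the shared zero buffer so
+        # padding taps contribute nothing; otherwise a_offset relocates the
+        # pointer into the packed A data.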
+        LDP     x13, x14, [x4], 16
+        LDP     x15, x20, [x4], 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x8            // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
+        CMP     x14, x12                // if a1 == zero
+        ADD     x14, x14, x8            // a1 += a_offset
+        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 += a_offset
+        CMP     x15, x12                // if a2 == zero
+        ADD     x15, x15, x8            // a2 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 += a_offset
+        CMP     x20, x12                // if a3 == zero
+        ADD     x20, x20, x8            // a3 += a_offset
+        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 += a_offset
+
+        # Are there at least 8 bytes of A for the main loop?
+        SUBS    x0, x2, 8               // k = kc - 8
+        B.LO    4f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+2:
+        LD1     {v0.8b}, [x13], 8
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x14], 8
+        LD1     {v2.8b}, [x15], 8
+        LD1     {v3.8b}, [x20], 8
+        ${XXTL}    v0.8h, v0.8b
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        ${XXTL}    v1.8h, v1.8b
+        ${XXTL}    v2.8h, v2.8b
+        ${XXTL}    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x13, 128]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x14, 128]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x15, 128]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x20, 128]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x5, 448]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        $if PREFETCH:
+          PRFM    PLDL1KEEP, [x5, 512]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[7]
+        SMLAL2  v20.4s, v4.8h, v0.h[7]
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v17.4s, v4.4h, v1.h[7]
+        SMLAL2  v21.4s, v4.8h, v1.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v18.4s, v4.4h, v2.h[7]
+        SMLAL2  v22.4s, v4.8h, v2.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v19.4s, v4.4h, v3.h[7]
+        SMLAL2  v23.4s, v4.8h, v3.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    2b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? - 1 to 7 bytes of A
+        CBNZ    x0, 4f
+
+3:
+        # ks loop
+        SUBS    x9, x9, 32              // ks -= MR * sizeof(${XINT8_T}*)
+        B.HI    1b
+
+        $if REQUANTIZATION == "GEMMLOWP":
+          # Apply params - scale, shift, bias and clamp
+          LD2R    {v4.4s, v5.4s}, [x11], 8
+          CMEQ    v6.4s, v5.4s, 0
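+          # Rounding fixup (a note on intent): SRSHL rounds ties toward
+          # +inf, while gemmlowp rounds ties away from zero.  CMEQ/BIC
+          # capture the pre-scale accumulators for lanes with a nonzero
+          # shift, and the SSRA below adds their sign bits (subtracting 1
+          # when negative) so the final shift rounds ties away from zero.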
+
+          BIC     v0.16b, v16.16b, v6.16b
+          BIC     v1.16b, v17.16b, v6.16b
+          BIC     v2.16b, v18.16b, v6.16b
+          BIC     v3.16b, v19.16b, v6.16b
+
+          SQRDMULH v16.4s, v16.4s, v4.4s
+          SQRDMULH v17.4s, v17.4s, v4.4s
+          SQRDMULH v18.4s, v18.4s, v4.4s
+          SQRDMULH v19.4s, v19.4s, v4.4s
+
+          SSRA    v16.4s, v0.4s, 31       // signed shift right accumulate
+          SSRA    v17.4s, v1.4s, 31
+          SSRA    v18.4s, v2.4s, 31
+          SSRA    v19.4s, v3.4s, 31
+
+          BIC     v0.16b, v20.16b, v6.16b
+          BIC     v1.16b, v21.16b, v6.16b
+          BIC     v2.16b, v22.16b, v6.16b
+          BIC     v3.16b, v23.16b, v6.16b
+
+          SQRDMULH v20.4s, v20.4s, v4.4s
+          SQRDMULH v21.4s, v21.4s, v4.4s
+          SQRDMULH v22.4s, v22.4s, v4.4s
+          SQRDMULH v23.4s, v23.4s, v4.4s
+
+          SSRA    v20.4s, v0.4s, 31
+          SSRA    v21.4s, v1.4s, 31
+          SSRA    v22.4s, v2.4s, 31
+          SSRA    v23.4s, v3.4s, 31
+
+          BIC     v0.16b, v24.16b, v6.16b
+          BIC     v1.16b, v25.16b, v6.16b
+          BIC     v2.16b, v26.16b, v6.16b
+          BIC     v3.16b, v27.16b, v6.16b
+
+          SQRDMULH v24.4s, v24.4s, v4.4s
+          SQRDMULH v25.4s, v25.4s, v4.4s
+          SQRDMULH v26.4s, v26.4s, v4.4s
+          SQRDMULH v27.4s, v27.4s, v4.4s
+
+          SSRA    v24.4s, v0.4s, 31
+          SSRA    v25.4s, v1.4s, 31
+          SSRA    v26.4s, v2.4s, 31
+          SSRA    v27.4s, v3.4s, 31
+
+          BIC     v0.16b, v28.16b, v6.16b
+          BIC     v1.16b, v29.16b, v6.16b
+          BIC     v2.16b, v30.16b, v6.16b
+          BIC     v3.16b, v31.16b, v6.16b
+
+          SQRDMULH v28.4s, v28.4s, v4.4s
+          SQRDMULH v29.4s, v29.4s, v4.4s
+          SQRDMULH v30.4s, v30.4s, v4.4s
+          SQRDMULH v31.4s, v31.4s, v4.4s
+
+          SSRA    v28.4s, v0.4s, 31
+          SSRA    v29.4s, v1.4s, 31
+          SSRA    v30.4s, v2.4s, 31
+          SSRA    v31.4s, v3.4s, 31
+
+          SRSHL   v16.4s, v16.4s, v5.4s   // signed rounding shift left
+          SRSHL   v17.4s, v17.4s, v5.4s
+          SRSHL   v18.4s, v18.4s, v5.4s
+          SRSHL   v19.4s, v19.4s, v5.4s
+          SRSHL   v20.4s, v20.4s, v5.4s
+          SRSHL   v21.4s, v21.4s, v5.4s
+          SRSHL   v22.4s, v22.4s, v5.4s
+          SRSHL   v23.4s, v23.4s, v5.4s
+          SRSHL   v24.4s, v24.4s, v5.4s
+          SRSHL   v25.4s, v25.4s, v5.4s
+          SRSHL   v26.4s, v26.4s, v5.4s
+          SRSHL   v27.4s, v27.4s, v5.4s
+          SRSHL   v28.4s, v28.4s, v5.4s
+          SRSHL   v29.4s, v29.4s, v5.4s
+          SRSHL   v30.4s, v30.4s, v5.4s
+          SRSHL   v31.4s, v31.4s, v5.4s
+        $elif REQUANTIZATION == "RNDNU":
+          # Apply params - preshift, scale, postshift, bias and clamp
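+          # RNDNU requantization in three stages: SSHL left-shifts the
+          # accumulators into the upper bits (pre-shift), SQDMULH takes
+          # the saturating doubling high half of the product (no
+          # rounding), and SRSHL applies the rounding right shift via a
+          # negative post-shift.  The interleaved LD1R loads stream the
+          # pre-shift, multiplier and post-shift values from params.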
+          LD1R    {v4.4s}, [x11], 4
+          SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
+          SSHL    v17.4s, v17.4s, v4.4s
+          SSHL    v18.4s, v18.4s, v4.4s
+          SSHL    v19.4s, v19.4s, v4.4s
+          SSHL    v20.4s, v20.4s, v4.4s
+          SSHL    v21.4s, v21.4s, v4.4s
+          SSHL    v22.4s, v22.4s, v4.4s
+          SSHL    v23.4s, v23.4s, v4.4s
+          LD1R    {v5.4s}, [x11], 4
+          SSHL    v24.4s, v24.4s, v4.4s
+          SSHL    v25.4s, v25.4s, v4.4s
+          SSHL    v26.4s, v26.4s, v4.4s
+          SSHL    v27.4s, v27.4s, v4.4s
+          SSHL    v28.4s, v28.4s, v4.4s
+          SSHL    v29.4s, v29.4s, v4.4s
+          SSHL    v30.4s, v30.4s, v4.4s
+          SSHL    v31.4s, v31.4s, v4.4s
+          LD1R    {v6.4s}, [x11], 4
+          SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
+          SQDMULH v17.4s, v17.4s, v5.4s
+          SQDMULH v18.4s, v18.4s, v5.4s
+          SQDMULH v19.4s, v19.4s, v5.4s
+          SQDMULH v20.4s, v20.4s, v5.4s
+          SQDMULH v21.4s, v21.4s, v5.4s
+          SQDMULH v22.4s, v22.4s, v5.4s
+          SQDMULH v23.4s, v23.4s, v5.4s
+          SQDMULH v24.4s, v24.4s, v5.4s
+          SQDMULH v25.4s, v25.4s, v5.4s
+          SQDMULH v26.4s, v26.4s, v5.4s
+          SQDMULH v27.4s, v27.4s, v5.4s
+          SQDMULH v28.4s, v28.4s, v5.4s
+          SQDMULH v29.4s, v29.4s, v5.4s
+          SQDMULH v30.4s, v30.4s, v5.4s
+          SQDMULH v31.4s, v31.4s, v5.4s
+          SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
+          SRSHL   v17.4s, v17.4s, v6.4s
+          SRSHL   v18.4s, v18.4s, v6.4s
+          SRSHL   v19.4s, v19.4s, v6.4s
+          SRSHL   v20.4s, v20.4s, v6.4s
+          SRSHL   v21.4s, v21.4s, v6.4s
+          SRSHL   v22.4s, v22.4s, v6.4s
+          SRSHL   v23.4s, v23.4s, v6.4s
+          SRSHL   v24.4s, v24.4s, v6.4s
+          SRSHL   v25.4s, v25.4s, v6.4s
+          SRSHL   v26.4s, v26.4s, v6.4s
+          SRSHL   v27.4s, v27.4s, v6.4s
+          SRSHL   v28.4s, v28.4s, v6.4s
+          SRSHL   v29.4s, v29.4s, v6.4s
+          SRSHL   v30.4s, v30.4s, v6.4s
+          SRSHL   v31.4s, v31.4s, v6.4s
+        $elif REQUANTIZATION == "FP32":
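+          # FP32 requantization: SCVTF converts the int32 accumulators to
+          # float, FMUL applies the scale (one per-tensor value from
+          # params, or per-channel values read inline from the weights),
+          # and FCVTNS rounds back to int32 (round to nearest, ties even).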
+          SCVTF   v16.4s, v16.4s
+          SCVTF   v17.4s, v17.4s
+          $if not CHANNELWISE:
+            # Apply params - scale, bias and clamp
+            LD1R    {v4.4s}, [x11], 4
+            SCVTF   v18.4s, v18.4s
+            SCVTF   v19.4s, v19.4s
+          $else:
+            # Load per channel scale values from weights
+            LDR     q4, [x5], 16
+            SCVTF   v18.4s, v18.4s
+            SCVTF   v19.4s, v19.4s
+            LDR     q5, [x5], 16
+          SCVTF   v20.4s, v20.4s
+          SCVTF   v21.4s, v21.4s
+          SCVTF   v22.4s, v22.4s
+          SCVTF   v23.4s, v23.4s
+          SCVTF   v24.4s, v24.4s
+          SCVTF   v25.4s, v25.4s
+          SCVTF   v26.4s, v26.4s
+          SCVTF   v27.4s, v27.4s
+          SCVTF   v28.4s, v28.4s
+          SCVTF   v29.4s, v29.4s
+          SCVTF   v30.4s, v30.4s
+          SCVTF   v31.4s, v31.4s
+
+          $if CHANNELWISE:
+            LDR     q6, [x5], 16
+            FMUL    v16.4s, v16.4s, v4.4s
+            FMUL    v17.4s, v17.4s, v4.4s
+            FMUL    v18.4s, v18.4s, v4.4s
+            FMUL    v19.4s, v19.4s, v4.4s
+            FMUL    v20.4s, v20.4s, v5.4s
+            LDR     q4, [x5], 16
+            FMUL    v21.4s, v21.4s, v5.4s
+            FMUL    v22.4s, v22.4s, v5.4s
+            FMUL    v23.4s, v23.4s, v5.4s
+            FMUL    v24.4s, v24.4s, v6.4s
+            FMUL    v25.4s, v25.4s, v6.4s
+            FMUL    v26.4s, v26.4s, v6.4s
+            FMUL    v27.4s, v27.4s, v6.4s
+            FMUL    v28.4s, v28.4s, v4.4s
+            FMUL    v29.4s, v29.4s, v4.4s
+            FMUL    v30.4s, v30.4s, v4.4s
+            FMUL    v31.4s, v31.4s, v4.4s
+          $else:
+            FMUL    v16.4s, v16.4s, v4.4s
+            FMUL    v17.4s, v17.4s, v4.4s
+            FMUL    v18.4s, v18.4s, v4.4s
+            FMUL    v19.4s, v19.4s, v4.4s
+            FMUL    v20.4s, v20.4s, v4.4s
+            FMUL    v21.4s, v21.4s, v4.4s
+            FMUL    v22.4s, v22.4s, v4.4s
+            FMUL    v23.4s, v23.4s, v4.4s
+            FMUL    v24.4s, v24.4s, v4.4s
+            FMUL    v25.4s, v25.4s, v4.4s
+            FMUL    v26.4s, v26.4s, v4.4s
+            FMUL    v27.4s, v27.4s, v4.4s
+            FMUL    v28.4s, v28.4s, v4.4s
+            FMUL    v29.4s, v29.4s, v4.4s
+            FMUL    v30.4s, v30.4s, v4.4s
+            FMUL    v31.4s, v31.4s, v4.4s
+
+          FCVTNS  v16.4s, v16.4s
+          FCVTNS  v17.4s, v17.4s
+          FCVTNS  v18.4s, v18.4s
+          FCVTNS  v19.4s, v19.4s
+          FCVTNS  v20.4s, v20.4s
+          FCVTNS  v21.4s, v21.4s
+          FCVTNS  v22.4s, v22.4s
+          FCVTNS  v23.4s, v23.4s
+          FCVTNS  v24.4s, v24.4s
+          FCVTNS  v25.4s, v25.4s
+          FCVTNS  v26.4s, v26.4s
+          FCVTNS  v27.4s, v27.4s
+          FCVTNS  v28.4s, v28.4s
+          FCVTNS  v29.4s, v29.4s
+          FCVTNS  v30.4s, v30.4s
+          FCVTNS  v31.4s, v31.4s
+
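+        # Output pipeline: SQXTN/SQXTN2 narrow each row's 16 int32 lanes
+        # into one int16 vector, SQADD adds the output zero point, a
+        # second narrowing pair produces 8-bit values, and the max/min
+        # pair clamps to the activation range.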
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2        // load output zero point
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v6.8h
+        SQADD   v17.8h, v17.8h, v6.8h
+        SQADD   v18.8h, v18.8h, v6.8h
+        SQADD   v19.8h, v19.8h, v6.8h
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v4.16b}, [x11], 1       // clamp min value
+
+        ${SQXTXN}   v0.8b, v16.8h
+        ${SQXTXN}   v1.8b, v17.8h
+        ${SQXTXN}   v2.8b, v18.8h
+        ${SQXTXN}   v3.8b, v19.8h
+        LD1R    {v5.16b}, [x11]          // clamp max value
+        ${SQXTXN2}  v0.16b, v24.8h
+        ${SQXTXN2}  v1.16b, v25.8h
+        ${SQXTXN2}  v2.16b, v26.8h
+        ${SQXTXN2}  v3.16b, v27.8h
+        SUB     x11, x11, ${REWIND_DECREMENT}             // rewind params pointer
+
+        ${XMAX}    v0.16b, v0.16b, v4.16b
+        ${XMAX}    v1.16b, v1.16b, v4.16b
+        ${XMAX}    v2.16b, v2.16b, v4.16b
+        ${XMAX}    v3.16b, v3.16b, v4.16b
+        SUBS    x1, x1, 16
+        ${XMIN}    v0.16b, v0.16b, v5.16b
+        ${XMIN}    v1.16b, v1.16b, v5.16b
+        ${XMIN}    v2.16b, v2.16b, v5.16b
+        ${XMIN}    v3.16b, v3.16b, v5.16b
+        B.LO    5f
+
+        # Store full 4 x 16
+        ST1     {v3.16b},  [x7], x10
+        ST1     {v2.16b}, [x17], x10
+        ST1     {v1.16b}, [x16], x10
+        ST1     {v0.16b},  [x6], x10
+
+        SUB     x4, x4, x3              // a -= ks
+
+        # nc loop
+        B.HI    0b
+
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+        # Remainder - 1 to 7 bytes of A
+        .p2align 3
+4:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b}, [x13], x0
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x14], x0
+        LD1     {v2.8b}, [x15], x0
+        LD1     {v3.8b}, [x20], x0
+        ${XXTL}    v0.8h, v0.8b
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        ${XXTL}    v1.8h, v1.8b
+        ${XXTL}    v2.8h, v2.8b
+        ${XXTL}    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        $if DATATYPE == "QU8":
+          USUBL   v4.8h, v4.8b, v7.8b
+        $else:
+          SXTL    v4.8h, v4.8b
+        $if DATATYPE == "QU8":
+          USUBL   v5.8h, v5.8b, v7.8b
+        $else:
+          SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       3b
+
+        # Store odd width
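+        # nc remainder is stored bit by bit: TBZ tests bits 3/2/1/0 of
+        # the remaining column count to store 8/4/2/1 bytes per row, and
+        # DUP shifts the unstored upper lanes down after each partial
+        # store.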
+        .p2align 3
+5:
+        TBZ     x1, 3, 6f
+        STR     d3, [x7], 8
+        STR     d2, [x17], 8
+        DUP     d3, v3.d[1]
+        DUP     d2, v2.d[1]
+        STR     d1, [x16], 8
+        STR     d0, [x6], 8
+        DUP     d1, v1.d[1]
+        DUP     d0, v0.d[1]
+6:
+        TBZ     x1, 2, 7f
+        STR     s3, [x7], 4
+        STR     s2, [x17], 4
+        DUP     s3, v3.s[1]
+        DUP     s2, v2.s[1]
+        STR     s1, [x16], 4
+        STR     s0, [x6], 4
+        DUP     s1, v1.s[1]
+        DUP     s0, v0.s[1]
+7:
+        TBZ     x1, 1, 8f
+        STR     h3, [x7], 2
+        STR     h2, [x17], 2
+        DUP     h3, v3.h[1]
+        DUP     h2, v2.h[1]
+        STR     h1, [x16], 2
+        STR     h0, [x6], 2
+        DUP     h1, v1.h[1]
+        DUP     h0, v0.h[1]
+8:
+        TBZ     x1, 0, 9f
+        STR     b3, [x7]
+        STR     b2, [x17]
+        STR     b1, [x16]
+        STR     b0, [x6]
+9:
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+END_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..42794c2
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,615 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t** restrict a, x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x8
+#     const int8_t* zero,                [sp + 16] -> x12
+#     const xnn_qs8_conv_minmax_params params)  [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x13  v0
+# A1  x14  v1
+# A2  x15  v2
+# A3  x20  v3
+# B    x5  v4  v5  v6
+# C0   x6 v16 v20 v24 v28
+# C1  x16 v17 v21 v25 v29
+# C2  x17 v18 v22 v26 v30
+# C3   x7 v19 v23 v27 v31
+# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
+# x11 params pointer, x21 temp for Cortex-A53 loads
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+        # Clamp C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
+        ADD     x16, x6, x7             // c1 = c0 + cm_stride
+        CSEL    x16, x6,  x16, LO       //   c1 = c0
+
+        ADD     x17, x16, x7            // c2 = c1 + cm_stride
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
+                                        // if mr <= 2
+        CSEL    x17, x16, x17, LS       //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        CSEL    x7,  x17, x7, LO        //   c3 = c2
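+        # Rows beyond mr alias the previous row's pointer, so their
+        # stores hit valid memory; rows are stored in c3..c0 order, so
+        # the real row's data is written last.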
+
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
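+        # All four rows start from the same per-channel bias: v16/v20/
+        # v24/v28 cover output columns 0-15, and the MOVs above replicate
+        # them into the row 1-3 accumulators.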
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP     x13, x14, [x4], 16
+        LDP     x15, x20, [x4], 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x8            // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
+        CMP     x14, x12                // if a1 == zero
+        ADD     x14, x14, x8            // a1 += a_offset
+        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
+        CMP     x15, x12                // if a2 == zero
+        ADD     x15, x15, x8            // a2 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
+        CMP     x20, x12                // if a3 == zero
+        ADD     x20, x20, x8            // a3 += a_offset
+        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset
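+        # An A pointer equal to the zero buffer marks padding: the CSEL
+        # keeps it pointing at the zero buffer (skipping a_offset), so
+        # padded taps contribute zeros to the accumulators.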
+
+        # Are there at least 8 bytes for the main loop?
+        SUBS    x0, x2, 8               // k = kc - 8
+        B.LO    4f
+
+        # Main loop - 8 bytes of A
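+        # Each pass consumes one 64-bit chunk (hence ld64) of every A row
+        # and 8 ranks of B; per rank, 16 int8 weights are widened to
+        # int16 and SMLAL/SMLAL2 accumulate one A lane against them in
+        # the int32 accumulators.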
+        .p2align 3
+2:
+        LD1     {v0.8b}, [x13], 8
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x14], 8
+        LD1     {v2.8b}, [x15], 8
+        LD1     {v3.8b}, [x20], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[7]
+        SMLAL2  v20.4s, v4.8h, v0.h[7]
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v17.4s, v4.4h, v1.h[7]
+        SMLAL2  v21.4s, v4.8h, v1.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v18.4s, v4.4h, v2.h[7]
+        SMLAL2  v22.4s, v4.8h, v2.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v19.4s, v4.4h, v3.h[7]
+        SMLAL2  v23.4s, v4.8h, v3.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    2b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? - 1 to 7 bytes of A
+        CBNZ    x0, 4f
+
+3:
+        # ks loop
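+        # x9 counts the remaining bytes of the A pointer array: 4 (MR)
+        # pointers of 8 bytes each are consumed per pass, hence the
+        # decrement of 32 below.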
+        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+        SCVTF   v16.4s, v16.4s
+        SCVTF   v17.4s, v17.4s
+        # Apply params - scale, bias and clamp
+        LD1R    {v4.4s}, [x11], 4
+        SCVTF   v18.4s, v18.4s
+        SCVTF   v19.4s, v19.4s
+        SCVTF   v20.4s, v20.4s
+        SCVTF   v21.4s, v21.4s
+        SCVTF   v22.4s, v22.4s
+        SCVTF   v23.4s, v23.4s
+        SCVTF   v24.4s, v24.4s
+        SCVTF   v25.4s, v25.4s
+        SCVTF   v26.4s, v26.4s
+        SCVTF   v27.4s, v27.4s
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        FMUL    v16.4s, v16.4s, v4.4s
+        FMUL    v17.4s, v17.4s, v4.4s
+        FMUL    v18.4s, v18.4s, v4.4s
+        FMUL    v19.4s, v19.4s, v4.4s
+        FMUL    v20.4s, v20.4s, v4.4s
+        FMUL    v21.4s, v21.4s, v4.4s
+        FMUL    v22.4s, v22.4s, v4.4s
+        FMUL    v23.4s, v23.4s, v4.4s
+        FMUL    v24.4s, v24.4s, v4.4s
+        FMUL    v25.4s, v25.4s, v4.4s
+        FMUL    v26.4s, v26.4s, v4.4s
+        FMUL    v27.4s, v27.4s, v4.4s
+        FMUL    v28.4s, v28.4s, v4.4s
+        FMUL    v29.4s, v29.4s, v4.4s
+        FMUL    v30.4s, v30.4s, v4.4s
+        FMUL    v31.4s, v31.4s, v4.4s
+
+        FCVTNS  v16.4s, v16.4s
+        FCVTNS  v17.4s, v17.4s
+        FCVTNS  v18.4s, v18.4s
+        FCVTNS  v19.4s, v19.4s
+        FCVTNS  v20.4s, v20.4s
+        FCVTNS  v21.4s, v21.4s
+        FCVTNS  v22.4s, v22.4s
+        FCVTNS  v23.4s, v23.4s
+        FCVTNS  v24.4s, v24.4s
+        FCVTNS  v25.4s, v25.4s
+        FCVTNS  v26.4s, v26.4s
+        FCVTNS  v27.4s, v27.4s
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2        // load output zero point
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v6.8h
+        SQADD   v17.8h, v17.8h, v6.8h
+        SQADD   v18.8h, v18.8h, v6.8h
+        SQADD   v19.8h, v19.8h, v6.8h
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v4.16b}, [x11], 1       // clamp min value
+
+        SQXTN   v0.8b, v16.8h
+        SQXTN   v1.8b, v17.8h
+        SQXTN   v2.8b, v18.8h
+        SQXTN   v3.8b, v19.8h
+        LD1R    {v5.16b}, [x11]          // clamp max value
+        SQXTN2  v0.16b, v24.8h
+        SQXTN2  v1.16b, v25.8h
+        SQXTN2  v2.16b, v26.8h
+        SQXTN2  v3.16b, v27.8h
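+        # The params pointer advanced 4 (scale) + 2 (zero point) + 1
+        # (min) bytes above; the SUB below rewinds those 7 bytes for the
+        # next nc iteration.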
+        SUB     x11, x11, 7             // rewind params pointer
+
+        SMAX    v0.16b, v0.16b, v4.16b
+        SMAX    v1.16b, v1.16b, v4.16b
+        SMAX    v2.16b, v2.16b, v4.16b
+        SMAX    v3.16b, v3.16b, v4.16b
+        SUBS    x1, x1, 16
+        SMIN    v0.16b, v0.16b, v5.16b
+        SMIN    v1.16b, v1.16b, v5.16b
+        SMIN    v2.16b, v2.16b, v5.16b
+        SMIN    v3.16b, v3.16b, v5.16b
+        B.LO    5f
+
+        # Store full 4 x 16
+        ST1     {v3.16b},  [x7], x10
+        ST1     {v2.16b}, [x17], x10
+        ST1     {v1.16b}, [x16], x10
+        ST1     {v0.16b},  [x6], x10
+
+        SUB     x4, x4, x3              // a -= ks
+
+        # nc loop
+        B.HI    0b
+
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+        # Remainder - 1 to 7 bytes of A
+        .p2align 3
+4:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b}, [x13], x0
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x14], x0
+        LD1     {v2.8b}, [x15], x0
+        LD1     {v3.8b}, [x20], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       3b
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ     x1, 3, 6f
+        STR     d3, [x7], 8
+        STR     d2, [x17], 8
+        DUP     d3, v3.d[1]
+        DUP     d2, v2.d[1]
+        STR     d1, [x16], 8
+        STR     d0, [x6], 8
+        DUP     d1, v1.d[1]
+        DUP     d0, v0.d[1]
+6:
+        TBZ     x1, 2, 7f
+        STR     s3, [x7], 4
+        STR     s2, [x17], 4
+        DUP     s3, v3.s[1]
+        DUP     s2, v2.s[1]
+        STR     s1, [x16], 4
+        STR     s0, [x6], 4
+        DUP     s1, v1.s[1]
+        DUP     s0, v0.s[1]
+7:
+        TBZ     x1, 1, 8f
+        STR     h3, [x7], 2
+        STR     h2, [x17], 2
+        DUP     h3, v3.h[1]
+        DUP     h2, v2.h[1]
+        STR     h1, [x16], 2
+        STR     h0, [x6], 2
+        DUP     h1, v1.h[1]
+        DUP     h0, v0.h[1]
+8:
+        TBZ     x1, 0, 9f
+        STR     b3, [x7]
+        STR     b2, [x17]
+        STR     b1, [x16]
+        STR     b0, [x6]
+9:
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..b9cda9e
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,621 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t** restrict a, x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x8
+#     const int8_t* zero,                [sp + 16] -> x12
+#     const xnn_qs8_conv_minmax_params params)  [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x13  v0
+# A1  x14  v1
+# A2  x15  v2
+# A3  x20  v3
+# B    x5  v4  v5  v6
+# C0   x6 v16 v20 v24 v28
+# C1  x16 v17 v21 v25 v29
+# C2  x17 v18 v22 v26 v30
+# C3   x7 v19 v23 v27 v31
+# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
+# x11 params pointer, x21 temp for Cortex-A53 loads
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+        # Clamp C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
+        ADD     x16, x6, x7             // c1 = c0 + cm_stride
+        CSEL    x16, x6,  x16, LO       //   c1 = c0
+
+        ADD     x17, x16, x7            // c2 = c1 + cm_stride
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
+                                        // if mr <= 2
+        CSEL    x17, x16, x17, LS       //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        CSEL    x7,  x17, x7, LO        //   c3 = c2
+
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP     x13, x14, [x4], 16
+        LDP     x15, x20, [x4], 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x8            // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 + a_offset
+        CMP     x14, x12                // if a1 == zero
+        ADD     x14, x14, x8            // a1 += a_offset
+        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 + a_offset
+        CMP     x15, x12                // if a2 == zero
+        ADD     x15, x15, x8            // a2 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 + a_offset
+        CMP     x20, x12                // if a3 == zero
+        ADD     x20, x20, x8            // a3 += a_offset
+        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 + a_offset
+
+        # Are there at least 8 bytes for the main loop?
+        SUBS    x0, x2, 8               // k = kc - 8
+        B.LO    4f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+2:
+        LD1     {v0.8b}, [x13], 8
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x14], 8
+        LD1     {v2.8b}, [x15], 8
+        LD1     {v3.8b}, [x20], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        PRFM    PLDL1KEEP, [x13, 128]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        PRFM    PLDL1KEEP, [x14, 128]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        PRFM    PLDL1KEEP, [x15, 128]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        PRFM    PLDL1KEEP, [x20, 128]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        PRFM    PLDL1KEEP, [x5, 448]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        PRFM    PLDL1KEEP, [x5, 512]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
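+        # The PRFM PLDL1KEEP hints above prefetch each A row 128 bytes
+        # ahead and the packed weights 448/512 bytes ahead into L1; they
+        # are issued once per 8-byte block, interleaved with rank 0 to
+        # stay off the critical path.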
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[7]
+        SMLAL2  v20.4s, v4.8h, v0.h[7]
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v17.4s, v4.4h, v1.h[7]
+        SMLAL2  v21.4s, v4.8h, v1.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v18.4s, v4.4h, v2.h[7]
+        SMLAL2  v22.4s, v4.8h, v2.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v19.4s, v4.4h, v3.h[7]
+        SMLAL2  v23.4s, v4.8h, v3.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    2b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? - 1 to 7 bytes of A
+        CBNZ    x0, 4f
+
+3:
+        # ks loop
+        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+        SCVTF   v16.4s, v16.4s
+        SCVTF   v17.4s, v17.4s
+        # Apply params - scale, bias and clamp
+        LD1R    {v4.4s}, [x11], 4
+        SCVTF   v18.4s, v18.4s
+        SCVTF   v19.4s, v19.4s
+        SCVTF   v20.4s, v20.4s
+        SCVTF   v21.4s, v21.4s
+        SCVTF   v22.4s, v22.4s
+        SCVTF   v23.4s, v23.4s
+        SCVTF   v24.4s, v24.4s
+        SCVTF   v25.4s, v25.4s
+        SCVTF   v26.4s, v26.4s
+        SCVTF   v27.4s, v27.4s
+        SCVTF   v28.4s, v28.4s
+        SCVTF   v29.4s, v29.4s
+        SCVTF   v30.4s, v30.4s
+        SCVTF   v31.4s, v31.4s
+
+        FMUL    v16.4s, v16.4s, v4.4s
+        FMUL    v17.4s, v17.4s, v4.4s
+        FMUL    v18.4s, v18.4s, v4.4s
+        FMUL    v19.4s, v19.4s, v4.4s
+        FMUL    v20.4s, v20.4s, v4.4s
+        FMUL    v21.4s, v21.4s, v4.4s
+        FMUL    v22.4s, v22.4s, v4.4s
+        FMUL    v23.4s, v23.4s, v4.4s
+        FMUL    v24.4s, v24.4s, v4.4s
+        FMUL    v25.4s, v25.4s, v4.4s
+        FMUL    v26.4s, v26.4s, v4.4s
+        FMUL    v27.4s, v27.4s, v4.4s
+        FMUL    v28.4s, v28.4s, v4.4s
+        FMUL    v29.4s, v29.4s, v4.4s
+        FMUL    v30.4s, v30.4s, v4.4s
+        FMUL    v31.4s, v31.4s, v4.4s
+
+        FCVTNS  v16.4s, v16.4s
+        FCVTNS  v17.4s, v17.4s
+        FCVTNS  v18.4s, v18.4s
+        FCVTNS  v19.4s, v19.4s
+        FCVTNS  v20.4s, v20.4s
+        FCVTNS  v21.4s, v21.4s
+        FCVTNS  v22.4s, v22.4s
+        FCVTNS  v23.4s, v23.4s
+        FCVTNS  v24.4s, v24.4s
+        FCVTNS  v25.4s, v25.4s
+        FCVTNS  v26.4s, v26.4s
+        FCVTNS  v27.4s, v27.4s
+        FCVTNS  v28.4s, v28.4s
+        FCVTNS  v29.4s, v29.4s
+        FCVTNS  v30.4s, v30.4s
+        FCVTNS  v31.4s, v31.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2        // load output zero point
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v6.8h
+        SQADD   v17.8h, v17.8h, v6.8h
+        SQADD   v18.8h, v18.8h, v6.8h
+        SQADD   v19.8h, v19.8h, v6.8h
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v4.16b}, [x11], 1       // clamp min value
+
+        SQXTN   v0.8b, v16.8h
+        SQXTN   v1.8b, v17.8h
+        SQXTN   v2.8b, v18.8h
+        SQXTN   v3.8b, v19.8h
+        LD1R    {v5.16b}, [x11]          // clamp max value
+        SQXTN2  v0.16b, v24.8h
+        SQXTN2  v1.16b, v25.8h
+        SQXTN2  v2.16b, v26.8h
+        SQXTN2  v3.16b, v27.8h
+        SUB     x11, x11, 7             // rewind params pointer
+
+        SMAX    v0.16b, v0.16b, v4.16b
+        SMAX    v1.16b, v1.16b, v4.16b
+        SMAX    v2.16b, v2.16b, v4.16b
+        SMAX    v3.16b, v3.16b, v4.16b
+        SUBS    x1, x1, 16
+        SMIN    v0.16b, v0.16b, v5.16b
+        SMIN    v1.16b, v1.16b, v5.16b
+        SMIN    v2.16b, v2.16b, v5.16b
+        SMIN    v3.16b, v3.16b, v5.16b
+        B.LO    5f
+
+        # Store full 4 x 16
+        ST1     {v3.16b},  [x7], x10
+        ST1     {v2.16b}, [x17], x10
+        ST1     {v1.16b}, [x16], x10
+        ST1     {v0.16b},  [x6], x10
+
+        SUB     x4, x4, x3              // a -= ks
+
+        # nc loop
+        B.HI    0b
+
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+        # Remainder - 1 to 7 bytes of A
+        .p2align 3
+4:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b}, [x13], x0
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x14], x0
+        LD1     {v2.8b}, [x15], x0
+        LD1     {v3.8b}, [x20], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       3b
+
+        # Store odd width
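+        # Hedged C-like sketch of this tail store (illustrative only): each TBZ
+        # tests one bit of the remaining nc, storing then shifting lanes down:
+        #   if (nc & 8) { store 8 bytes per row; move high half down; }
+        #   if (nc & 4) { store 4 bytes; shift; }
+        #   if (nc & 2) { store 2 bytes; shift; }
+        #   if (nc & 1) { store last byte; }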
+        .p2align 3
+5:
+        TBZ     x1, 3, 6f
+        STR     d3, [x7], 8
+        STR     d2, [x17], 8
+        DUP     d3, v3.d[1]
+        DUP     d2, v2.d[1]
+        STR     d1, [x16], 8
+        STR     d0, [x6], 8
+        DUP     d1, v1.d[1]
+        DUP     d0, v0.d[1]
+6:
+        TBZ     x1, 2, 7f
+        STR     s3, [x7], 4
+        STR     s2, [x17], 4
+        DUP     s3, v3.s[1]
+        DUP     s2, v2.s[1]
+        STR     s1, [x16], 4
+        STR     s0, [x6], 4
+        DUP     s1, v1.s[1]
+        DUP     s0, v0.s[1]
+7:
+        TBZ     x1, 1, 8f
+        STR     h3, [x7], 2
+        STR     h2, [x17], 2
+        DUP     h3, v3.h[1]
+        DUP     h2, v2.h[1]
+        STR     h1, [x16], 2
+        STR     h0, [x6], 2
+        DUP     h1, v1.h[1]
+        DUP     h0, v0.h[1]
+8:
+        TBZ     x1, 0, 9f
+        STR     b3, [x7]
+        STR     b2, [x17]
+        STR     b1, [x16]
+        STR     b0, [x6]
+9:
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..201f541
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,615 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t** restrict a, x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x8
+#     const int8_t* zero,                [sp + 16] -> x12
+#     const xnn_qs8_conv_minmax_params params)  [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x13  v0
+# A1  x14  v1
+# A2  x15  v2
+# A3  x20  v3
+# B    x5  v4  v5  v6
+# C0   x6 v16 v20 v24 v28
+# C1  x16 v17 v21 v25 v29
+# C2  x17 v18 v22 v26 v30
+# C3   x7 v19 v23 v27 v31
+# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
+# x11, x21 temp for Cortex-A53 loads
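+#
+# Hedged sketch of the 4x16 tile this kernel computes (illustrative only):
+#   for m in 0..3, n in 0..15, k in 0..kc-1:
+#     C[m][n] += (int16_t) A[m][k] * (int16_t) B[k][n]
+# Each C row holds 16 int32 accumulators across four q-registers (e.g. C0 in
+# v16 v20 v24 v28), matching the register map above.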
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+        # Clamp C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
+        ADD     x16, x6, x7             // c1 = c0 + cm_stride
+        CSEL    x16, x6,  x16, LO       //   c1 = c0
+
+        ADD     x17, x16, x7            // c2 = c1 + cm_stride
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
+                                        // if mr <= 2
+        CSEL    x17, x16, x17, LS       //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        CSEL    x7,  x17, x7, LO        //   c3 = c2
+
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP     x13, x14, [x4], 16
+        LDP     x15, x20, [x4], 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x8            // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
+        CMP     x14, x12                // if a1 == zero
+        ADD     x14, x14, x8            // a1 += a_offset
+        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 += a_offset
+        CMP     x15, x12                // if a2 == zero
+        ADD     x15, x15, x8            // a2 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 += a_offset
+        CMP     x20, x12                // if a3 == zero
+        ADD     x20, x20, x8            // a3 += a_offset
+        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 += a_offset
+
+        # Are there at least 8 bytes for the main loop?
+        SUBS    x0, x2, 8               // k = kc - 8
+        B.LO    4f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+2:
+        LD1     {v0.8b}, [x13], 8
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x14], 8
+        LD1     {v2.8b}, [x15], 8
+        LD1     {v3.8b}, [x20], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[7]
+        SMLAL2  v20.4s, v4.8h, v0.h[7]
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v17.4s, v4.4h, v1.h[7]
+        SMLAL2  v21.4s, v4.8h, v1.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v18.4s, v4.4h, v2.h[7]
+        SMLAL2  v22.4s, v4.8h, v2.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v19.4s, v4.4h, v3.h[7]
+        SMLAL2  v23.4s, v4.8h, v3.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    2b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? (1 to 7 bytes of A)
+        CBNZ    x0, 4f
+
+3:
+        # ks loop
+        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+        # Apply params - preshift, scale, postshift, bias and clamp
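+        # Hedged scalar sketch of the rndnu requantization (illustrative only;
+        # the params layout - pre-shift, multiplier, post-shift, zero point,
+        # min, max - is assumed from the loads below):
+        #   acc = asr(acc, pre_shift)                          // SSHL by a negative amount
+        #   acc = sat(((int64_t) acc * multiplier * 2) >> 32)  // SQDMULH
+        #   acc = rounding_asr(acc, post_shift)                // SRSHL by a negative amount
+        #   acc = sat_narrow_s16(acc) + zero_point             // SQXTN/SQXTN2 + SQADD
+        #   out = clamp(sat_narrow_s8(acc), min, max)          // SQXTN/SQXTN2 + SMAX/SMIN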
+        LD1R    {v4.4s}, [x11], 4
+        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
+        SSHL    v17.4s, v17.4s, v4.4s
+        SSHL    v18.4s, v18.4s, v4.4s
+        SSHL    v19.4s, v19.4s, v4.4s
+        SSHL    v20.4s, v20.4s, v4.4s
+        SSHL    v21.4s, v21.4s, v4.4s
+        SSHL    v22.4s, v22.4s, v4.4s
+        SSHL    v23.4s, v23.4s, v4.4s
+        LD1R    {v5.4s}, [x11], 4
+        SSHL    v24.4s, v24.4s, v4.4s
+        SSHL    v25.4s, v25.4s, v4.4s
+        SSHL    v26.4s, v26.4s, v4.4s
+        SSHL    v27.4s, v27.4s, v4.4s
+        SSHL    v28.4s, v28.4s, v4.4s
+        SSHL    v29.4s, v29.4s, v4.4s
+        SSHL    v30.4s, v30.4s, v4.4s
+        SSHL    v31.4s, v31.4s, v4.4s
+        LD1R    {v6.4s}, [x11], 4
+        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
+        SQDMULH v17.4s, v17.4s, v5.4s
+        SQDMULH v18.4s, v18.4s, v5.4s
+        SQDMULH v19.4s, v19.4s, v5.4s
+        SQDMULH v20.4s, v20.4s, v5.4s
+        SQDMULH v21.4s, v21.4s, v5.4s
+        SQDMULH v22.4s, v22.4s, v5.4s
+        SQDMULH v23.4s, v23.4s, v5.4s
+        SQDMULH v24.4s, v24.4s, v5.4s
+        SQDMULH v25.4s, v25.4s, v5.4s
+        SQDMULH v26.4s, v26.4s, v5.4s
+        SQDMULH v27.4s, v27.4s, v5.4s
+        SQDMULH v28.4s, v28.4s, v5.4s
+        SQDMULH v29.4s, v29.4s, v5.4s
+        SQDMULH v30.4s, v30.4s, v5.4s
+        SQDMULH v31.4s, v31.4s, v5.4s
+        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
+        SRSHL   v17.4s, v17.4s, v6.4s
+        SRSHL   v18.4s, v18.4s, v6.4s
+        SRSHL   v19.4s, v19.4s, v6.4s
+        SRSHL   v20.4s, v20.4s, v6.4s
+        SRSHL   v21.4s, v21.4s, v6.4s
+        SRSHL   v22.4s, v22.4s, v6.4s
+        SRSHL   v23.4s, v23.4s, v6.4s
+        SRSHL   v24.4s, v24.4s, v6.4s
+        SRSHL   v25.4s, v25.4s, v6.4s
+        SRSHL   v26.4s, v26.4s, v6.4s
+        SRSHL   v27.4s, v27.4s, v6.4s
+        SRSHL   v28.4s, v28.4s, v6.4s
+        SRSHL   v29.4s, v29.4s, v6.4s
+        SRSHL   v30.4s, v30.4s, v6.4s
+        SRSHL   v31.4s, v31.4s, v6.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2        // add bias
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v6.8h
+        SQADD   v17.8h, v17.8h, v6.8h
+        SQADD   v18.8h, v18.8h, v6.8h
+        SQADD   v19.8h, v19.8h, v6.8h
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v4.16b}, [x11], 1       // clamp min value
+
+        SQXTN   v0.8b, v16.8h
+        SQXTN   v1.8b, v17.8h
+        SQXTN   v2.8b, v18.8h
+        SQXTN   v3.8b, v19.8h
+        LD1R    {v5.16b}, [x11]          // clamp max value
+        SQXTN2  v0.16b, v24.8h
+        SQXTN2  v1.16b, v25.8h
+        SQXTN2  v2.16b, v26.8h
+        SQXTN2  v3.16b, v27.8h
+        SUB     x11, x11, 15             // rewind params pointer
+
+        SMAX    v0.16b, v0.16b, v4.16b
+        SMAX    v1.16b, v1.16b, v4.16b
+        SMAX    v2.16b, v2.16b, v4.16b
+        SMAX    v3.16b, v3.16b, v4.16b
+        SUBS    x1, x1, 16
+        SMIN    v0.16b, v0.16b, v5.16b
+        SMIN    v1.16b, v1.16b, v5.16b
+        SMIN    v2.16b, v2.16b, v5.16b
+        SMIN    v3.16b, v3.16b, v5.16b
+        B.LO    5f
+
+        # Store full 4 x 16
+        ST1     {v3.16b},  [x7], x10
+        ST1     {v2.16b}, [x17], x10
+        ST1     {v1.16b}, [x16], x10
+        ST1     {v0.16b},  [x6], x10
+
+        SUB     x4, x4, x3              // a -= ks
+
+        # nc loop
+        B.HI    0b
+
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+        # Remainder - 1 to 7 bytes of A
+        .p2align 3
+4:
+        AND     x0, x2, 7               // kc remainder 1 to 7
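+        # Hedged sketch of this remainder ladder (illustrative): one SMLAL
+        # group per leftover lane, exiting as soon as x0 lanes are consumed:
+        #   for (size_t i = 0; i < (kc & 7); i++)
+        #     C[m][n] += (int16_t) A[m][i] * (int16_t) B[i][n];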
+
+        LD1     {v0.8b}, [x13], x0
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x14], x0
+        LD1     {v2.8b}, [x15], x0
+        LD1     {v3.8b}, [x20], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       3b
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ     x1, 3, 6f
+        STR     d3, [x7], 8
+        STR     d2, [x17], 8
+        DUP     d3, v3.d[1]
+        DUP     d2, v2.d[1]
+        STR     d1, [x16], 8
+        STR     d0, [x6], 8
+        DUP     d1, v1.d[1]
+        DUP     d0, v0.d[1]
+6:
+        TBZ     x1, 2, 7f
+        STR     s3, [x7], 4
+        STR     s2, [x17], 4
+        DUP     s3, v3.s[1]
+        DUP     s2, v2.s[1]
+        STR     s1, [x16], 4
+        STR     s0, [x6], 4
+        DUP     s1, v1.s[1]
+        DUP     s0, v0.s[1]
+7:
+        TBZ     x1, 1, 8f
+        STR     h3, [x7], 2
+        STR     h2, [x17], 2
+        DUP     h3, v3.h[1]
+        DUP     h2, v2.h[1]
+        STR     h1, [x16], 2
+        STR     h0, [x6], 2
+        DUP     h1, v1.h[1]
+        DUP     h0, v0.h[1]
+8:
+        TBZ     x1, 0, 9f
+        STR     b3, [x7]
+        STR     b2, [x17]
+        STR     b1, [x16]
+        STR     b0, [x6]
+9:
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..a9eabe1
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,621 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64(
+#     size_t mr,                 x0
+#     size_t nc,                 x1
+#     size_t kc,                 x2 / x0
+#     size_t ks,                 x3 / x9
+#     const int8_t** restrict a, x4
+#     const int8_t* restrict w,  x5
+#     int8_t* restrict c,        x6
+#     size_t cm_stride,          x7
+#     size_t cn_stride,                  [sp] -> x10
+#     size_t a_offset,                   [sp + 8] -> x8
+#     const int8_t* zero,                [sp + 16] -> x12
+#     const xnn_qs8_conv_minmax_params params)  [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0  x13  v0
+# A1  x14  v1
+# A2  x15  v2
+# A3  x20  v3
+# B    x5  v4  v5  v6
+# C0   x6 v16 v20 v24 v28
+# C1  x16 v17 v21 v25 v29
+# C2  x17 v18 v22 v26 v30
+# C3   x7 v19 v23 v27 v31
+# unused  v7 v8 v9 v10 v11 v12 v13 v14 v15
+# x11, x21 temp for Cortex-A53 loads
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+        # Clamp C pointers
+        CMP     x0, 2                   // if mr < 2
+        LDP     x10, x8, [sp]           // Load cn_stride, a_offset
+        ADD     x16, x6, x7             // c1 = c0 + cm_stride
+        CSEL    x16, x6,  x16, LO       //   c1 = c0
+
+        ADD     x17, x16, x7            // c2 = c1 + cm_stride
+        LDP     x12, x11, [sp, 16]      // Load zero, params pointer
+                                        // if mr <= 2
+        CSEL    x17, x16, x17, LS       //   c2 = c1
+
+        CMP     x0, 4                   // if mr < 4
+        STP     x20, x21, [sp, -16]!    // Save x20-x21 on stack
+        ADD     x7,  x17, x7            // c3 = c2 + cm_stride
+        CSEL    x7,  x17, x7, LO        //   c3 = c2
+
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP     q16, q20, [x5], 32
+        MOV     v17.16b, v16.16b
+        MOV     v18.16b, v16.16b
+        LDP     q24, q28, [x5], 32
+        MOV     v19.16b, v16.16b
+        MOV     v21.16b, v20.16b
+        MOV     v22.16b, v20.16b
+        MOV     v23.16b, v20.16b
+        MOV     v25.16b, v24.16b
+        MOV     v26.16b, v24.16b
+        MOV     v27.16b, v24.16b
+        MOV     v29.16b, v28.16b
+        MOV     v30.16b, v28.16b
+        MOV     v31.16b, v28.16b
+        MOV     x9, x3                  // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP     x13, x14, [x4], 16
+        LDP     x15, x20, [x4], 16
+
+        CMP     x13, x12                // if a0 == zero
+        ADD     x13, x13, x8            // a0 += a_offset
+        CSEL    x13, x12, x13, EQ       //   a0 = zero, else a0 += a_offset
+        CMP     x14, x12                // if a1 == zero
+        ADD     x14, x14, x8            // a1 += a_offset
+        CSEL    x14, x12, x14, EQ       //   a1 = zero, else a1 += a_offset
+        CMP     x15, x12                // if a2 == zero
+        ADD     x15, x15, x8            // a2 += a_offset
+        CSEL    x15, x12, x15, EQ       //   a2 = zero, else a2 += a_offset
+        CMP     x20, x12                // if a3 == zero
+        ADD     x20, x20, x8            // a3 += a_offset
+        CSEL    x20, x12, x20, EQ       //   a3 = zero, else a3 += a_offset
+
+        # Are there at least 8 bytes for the main loop?
+        SUBS    x0, x2, 8               // k = kc - 8
+        B.LO    4f
+
+        # Main loop - 8 bytes of A
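+        # Prefetch note (assumed tuning, sketched from the PRFM offsets below):
+        # each A row is prefetched 128 bytes ahead and the packed weights 448
+        # and 512 bytes ahead with PLDL1KEEP, hiding load latency behind the
+        # SMLAL chain.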
+        .p2align 3
+2:
+        LD1     {v0.8b}, [x13], 8
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x14], 8
+        LD1     {v2.8b}, [x15], 8
+        LD1     {v3.8b}, [x20], 8
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        PRFM    PLDL1KEEP, [x13, 128]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        PRFM    PLDL1KEEP, [x14, 128]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        PRFM    PLDL1KEEP, [x15, 128]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        PRFM    PLDL1KEEP, [x20, 128]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        PRFM    PLDL1KEEP, [x5, 448]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        PRFM    PLDL1KEEP, [x5, 512]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[7]
+        SMLAL2  v20.4s, v4.8h, v0.h[7]
+        SMLAL   v24.4s, v5.4h, v0.h[7]
+        SMLAL2  v28.4s, v5.8h, v0.h[7]
+        SMLAL   v17.4s, v4.4h, v1.h[7]
+        SMLAL2  v21.4s, v4.8h, v1.h[7]
+        SMLAL   v25.4s, v5.4h, v1.h[7]
+        SMLAL2  v29.4s, v5.8h, v1.h[7]
+        SMLAL   v18.4s, v4.4h, v2.h[7]
+        SMLAL2  v22.4s, v4.8h, v2.h[7]
+        SMLAL   v26.4s, v5.4h, v2.h[7]
+        SMLAL2  v30.4s, v5.8h, v2.h[7]
+        SMLAL   v19.4s, v4.4h, v3.h[7]
+        SMLAL2  v23.4s, v4.8h, v3.h[7]
+        SMLAL   v27.4s, v5.4h, v3.h[7]
+        SMLAL2  v31.4s, v5.8h, v3.h[7]
+
+        SUBS    x0, x0, 8
+        B.HS    2b
+
+        AND     x0, x2, 7               // kc remainder 0 to 7
+        # Is there a remainder? (1 to 7 bytes of A)
+        CBNZ    x0, 4f
+
+3:
+        # ks loop
+        SUBS    x9, x9, 32              // ks -= MR * sizeof(int8_t*)
+        B.HI    1b
+
+        # Apply params - preshift, scale, postshift, bias and clamp
+        LD1R    {v4.4s}, [x11], 4
+        SSHL    v16.4s, v16.4s, v4.4s   // shift to upper bits
+        SSHL    v17.4s, v17.4s, v4.4s
+        SSHL    v18.4s, v18.4s, v4.4s
+        SSHL    v19.4s, v19.4s, v4.4s
+        SSHL    v20.4s, v20.4s, v4.4s
+        SSHL    v21.4s, v21.4s, v4.4s
+        SSHL    v22.4s, v22.4s, v4.4s
+        SSHL    v23.4s, v23.4s, v4.4s
+        LD1R    {v5.4s}, [x11], 4
+        SSHL    v24.4s, v24.4s, v4.4s
+        SSHL    v25.4s, v25.4s, v4.4s
+        SSHL    v26.4s, v26.4s, v4.4s
+        SSHL    v27.4s, v27.4s, v4.4s
+        SSHL    v28.4s, v28.4s, v4.4s
+        SSHL    v29.4s, v29.4s, v4.4s
+        SSHL    v30.4s, v30.4s, v4.4s
+        SSHL    v31.4s, v31.4s, v4.4s
+        LD1R    {v6.4s}, [x11], 4
+        SQDMULH v16.4s, v16.4s, v5.4s   // scale without rounding
+        SQDMULH v17.4s, v17.4s, v5.4s
+        SQDMULH v18.4s, v18.4s, v5.4s
+        SQDMULH v19.4s, v19.4s, v5.4s
+        SQDMULH v20.4s, v20.4s, v5.4s
+        SQDMULH v21.4s, v21.4s, v5.4s
+        SQDMULH v22.4s, v22.4s, v5.4s
+        SQDMULH v23.4s, v23.4s, v5.4s
+        SQDMULH v24.4s, v24.4s, v5.4s
+        SQDMULH v25.4s, v25.4s, v5.4s
+        SQDMULH v26.4s, v26.4s, v5.4s
+        SQDMULH v27.4s, v27.4s, v5.4s
+        SQDMULH v28.4s, v28.4s, v5.4s
+        SQDMULH v29.4s, v29.4s, v5.4s
+        SQDMULH v30.4s, v30.4s, v5.4s
+        SQDMULH v31.4s, v31.4s, v5.4s
+        SRSHL   v16.4s, v16.4s, v6.4s   // signed rounding shift left
+        SRSHL   v17.4s, v17.4s, v6.4s
+        SRSHL   v18.4s, v18.4s, v6.4s
+        SRSHL   v19.4s, v19.4s, v6.4s
+        SRSHL   v20.4s, v20.4s, v6.4s
+        SRSHL   v21.4s, v21.4s, v6.4s
+        SRSHL   v22.4s, v22.4s, v6.4s
+        SRSHL   v23.4s, v23.4s, v6.4s
+        SRSHL   v24.4s, v24.4s, v6.4s
+        SRSHL   v25.4s, v25.4s, v6.4s
+        SRSHL   v26.4s, v26.4s, v6.4s
+        SRSHL   v27.4s, v27.4s, v6.4s
+        SRSHL   v28.4s, v28.4s, v6.4s
+        SRSHL   v29.4s, v29.4s, v6.4s
+        SRSHL   v30.4s, v30.4s, v6.4s
+        SRSHL   v31.4s, v31.4s, v6.4s
+
+        SQXTN   v16.4h, v16.4s
+        SQXTN   v17.4h, v17.4s
+        SQXTN   v18.4h, v18.4s
+        SQXTN   v19.4h, v19.4s
+        SQXTN   v24.4h, v24.4s
+        SQXTN   v25.4h, v25.4s
+        SQXTN   v26.4h, v26.4s
+        SQXTN   v27.4h, v27.4s
+        LD1R    {v6.8h}, [x11], 2        // add bias
+
+        SQXTN2  v16.8h, v20.4s
+        SQXTN2  v17.8h, v21.4s
+        SQXTN2  v18.8h, v22.4s
+        SQXTN2  v19.8h, v23.4s
+        SQXTN2  v24.8h, v28.4s
+        SQXTN2  v25.8h, v29.4s
+        SQXTN2  v26.8h, v30.4s
+        SQXTN2  v27.8h, v31.4s
+
+        SQADD   v16.8h, v16.8h, v6.8h
+        SQADD   v17.8h, v17.8h, v6.8h
+        SQADD   v18.8h, v18.8h, v6.8h
+        SQADD   v19.8h, v19.8h, v6.8h
+        SQADD   v24.8h, v24.8h, v6.8h
+        SQADD   v25.8h, v25.8h, v6.8h
+        SQADD   v26.8h, v26.8h, v6.8h
+        SQADD   v27.8h, v27.8h, v6.8h
+        LD1R    {v4.16b}, [x11], 1       // clamp min value
+
+        SQXTN   v0.8b, v16.8h
+        SQXTN   v1.8b, v17.8h
+        SQXTN   v2.8b, v18.8h
+        SQXTN   v3.8b, v19.8h
+        LD1R    {v5.16b}, [x11]          // clamp max value
+        SQXTN2  v0.16b, v24.8h
+        SQXTN2  v1.16b, v25.8h
+        SQXTN2  v2.16b, v26.8h
+        SQXTN2  v3.16b, v27.8h
+        SUB     x11, x11, 15             // rewind params pointer
+
+        SMAX    v0.16b, v0.16b, v4.16b
+        SMAX    v1.16b, v1.16b, v4.16b
+        SMAX    v2.16b, v2.16b, v4.16b
+        SMAX    v3.16b, v3.16b, v4.16b
+        SUBS    x1, x1, 16
+        SMIN    v0.16b, v0.16b, v5.16b
+        SMIN    v1.16b, v1.16b, v5.16b
+        SMIN    v2.16b, v2.16b, v5.16b
+        SMIN    v3.16b, v3.16b, v5.16b
+        B.LO    5f
+
+        # Store full 4 x 16
+        ST1     {v3.16b},  [x7], x10
+        ST1     {v2.16b}, [x17], x10
+        ST1     {v1.16b}, [x16], x10
+        ST1     {v0.16b},  [x6], x10
+
+        SUB     x4, x4, x3              // a -= ks
+
+        # nc loop
+        B.HI    0b
+
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+        # Remainder - 1 to 7 bytes of A
+        .p2align 3
+4:
+        AND     x0, x2, 7               // kc remainder 1 to 7
+
+        LD1     {v0.8b}, [x13], x0
+        LDP     d4, d5, [x5], 16
+        LD1     {v1.8b}, [x14], x0
+        LD1     {v2.8b}, [x15], x0
+        LD1     {v3.8b}, [x20], x0
+        SXTL    v0.8h, v0.8b
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SXTL    v1.8h, v1.8b
+        SXTL    v2.8h, v2.8b
+        SXTL    v3.8h, v3.8b
+        SMLAL   v16.4s, v4.4h, v0.h[0]
+        SMLAL2  v20.4s, v4.8h, v0.h[0]
+        SMLAL   v24.4s, v5.4h, v0.h[0]
+        SMLAL2  v28.4s, v5.8h, v0.h[0]
+        SMLAL   v17.4s, v4.4h, v1.h[0]
+        SMLAL2  v21.4s, v4.8h, v1.h[0]
+        SMLAL   v25.4s, v5.4h, v1.h[0]
+        SMLAL2  v29.4s, v5.8h, v1.h[0]
+        SMLAL   v18.4s, v4.4h, v2.h[0]
+        SMLAL2  v22.4s, v4.8h, v2.h[0]
+        SMLAL   v26.4s, v5.4h, v2.h[0]
+        SMLAL2  v30.4s, v5.8h, v2.h[0]
+        SMLAL   v19.4s, v4.4h, v3.h[0]
+        SMLAL2  v23.4s, v4.8h, v3.h[0]
+        SMLAL   v27.4s, v5.4h, v3.h[0]
+        SMLAL2  v31.4s, v5.8h, v3.h[0]
+        CMP     x0, 2
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[1]
+        SMLAL2  v20.4s, v4.8h, v0.h[1]
+        SMLAL   v24.4s, v5.4h, v0.h[1]
+        SMLAL2  v28.4s, v5.8h, v0.h[1]
+        SMLAL   v17.4s, v4.4h, v1.h[1]
+        SMLAL2  v21.4s, v4.8h, v1.h[1]
+        SMLAL   v25.4s, v5.4h, v1.h[1]
+        SMLAL2  v29.4s, v5.8h, v1.h[1]
+        SMLAL   v18.4s, v4.4h, v2.h[1]
+        SMLAL2  v22.4s, v4.8h, v2.h[1]
+        SMLAL   v26.4s, v5.4h, v2.h[1]
+        SMLAL2  v30.4s, v5.8h, v2.h[1]
+        SMLAL   v19.4s, v4.4h, v3.h[1]
+        SMLAL2  v23.4s, v4.8h, v3.h[1]
+        SMLAL   v27.4s, v5.4h, v3.h[1]
+        SMLAL2  v31.4s, v5.8h, v3.h[1]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[2]
+        SMLAL2  v20.4s, v4.8h, v0.h[2]
+        SMLAL   v24.4s, v5.4h, v0.h[2]
+        SMLAL2  v28.4s, v5.8h, v0.h[2]
+        SMLAL   v17.4s, v4.4h, v1.h[2]
+        SMLAL2  v21.4s, v4.8h, v1.h[2]
+        SMLAL   v25.4s, v5.4h, v1.h[2]
+        SMLAL2  v29.4s, v5.8h, v1.h[2]
+        SMLAL   v18.4s, v4.4h, v2.h[2]
+        SMLAL2  v22.4s, v4.8h, v2.h[2]
+        SMLAL   v26.4s, v5.4h, v2.h[2]
+        SMLAL2  v30.4s, v5.8h, v2.h[2]
+        SMLAL   v19.4s, v4.4h, v3.h[2]
+        SMLAL2  v23.4s, v4.8h, v3.h[2]
+        SMLAL   v27.4s, v5.4h, v3.h[2]
+        SMLAL2  v31.4s, v5.8h, v3.h[2]
+        CMP     x0, 4
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[3]
+        SMLAL2  v20.4s, v4.8h, v0.h[3]
+        SMLAL   v24.4s, v5.4h, v0.h[3]
+        SMLAL2  v28.4s, v5.8h, v0.h[3]
+        SMLAL   v17.4s, v4.4h, v1.h[3]
+        SMLAL2  v21.4s, v4.8h, v1.h[3]
+        SMLAL   v25.4s, v5.4h, v1.h[3]
+        SMLAL2  v29.4s, v5.8h, v1.h[3]
+        SMLAL   v18.4s, v4.4h, v2.h[3]
+        SMLAL2  v22.4s, v4.8h, v2.h[3]
+        SMLAL   v26.4s, v5.4h, v2.h[3]
+        SMLAL2  v30.4s, v5.8h, v2.h[3]
+        SMLAL   v19.4s, v4.4h, v3.h[3]
+        SMLAL2  v23.4s, v4.8h, v3.h[3]
+        SMLAL   v27.4s, v5.4h, v3.h[3]
+        SMLAL2  v31.4s, v5.8h, v3.h[3]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[4]
+        SMLAL2  v20.4s, v4.8h, v0.h[4]
+        SMLAL   v24.4s, v5.4h, v0.h[4]
+        SMLAL2  v28.4s, v5.8h, v0.h[4]
+        SMLAL   v17.4s, v4.4h, v1.h[4]
+        SMLAL2  v21.4s, v4.8h, v1.h[4]
+        SMLAL   v25.4s, v5.4h, v1.h[4]
+        SMLAL2  v29.4s, v5.8h, v1.h[4]
+        SMLAL   v18.4s, v4.4h, v2.h[4]
+        SMLAL2  v22.4s, v4.8h, v2.h[4]
+        SMLAL   v26.4s, v5.4h, v2.h[4]
+        SMLAL2  v30.4s, v5.8h, v2.h[4]
+        SMLAL   v19.4s, v4.4h, v3.h[4]
+        SMLAL2  v23.4s, v4.8h, v3.h[4]
+        SMLAL   v27.4s, v5.4h, v3.h[4]
+        SMLAL2  v31.4s, v5.8h, v3.h[4]
+        CMP     x0, 6
+        B.LO    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[5]
+        SMLAL2  v20.4s, v4.8h, v0.h[5]
+        SMLAL   v24.4s, v5.4h, v0.h[5]
+        SMLAL2  v28.4s, v5.8h, v0.h[5]
+        SMLAL   v17.4s, v4.4h, v1.h[5]
+        SMLAL2  v21.4s, v4.8h, v1.h[5]
+        SMLAL   v25.4s, v5.4h, v1.h[5]
+        SMLAL2  v29.4s, v5.8h, v1.h[5]
+        SMLAL   v18.4s, v4.4h, v2.h[5]
+        SMLAL2  v22.4s, v4.8h, v2.h[5]
+        SMLAL   v26.4s, v5.4h, v2.h[5]
+        SMLAL2  v30.4s, v5.8h, v2.h[5]
+        SMLAL   v19.4s, v4.4h, v3.h[5]
+        SMLAL2  v23.4s, v4.8h, v3.h[5]
+        SMLAL   v27.4s, v5.4h, v3.h[5]
+        SMLAL2  v31.4s, v5.8h, v3.h[5]
+        B.EQ    3b
+
+        LDP     d4, d5, [x5], 16
+        SXTL    v4.8h, v4.8b
+        SXTL    v5.8h, v5.8b
+        SMLAL   v16.4s, v4.4h, v0.h[6]
+        SMLAL2  v20.4s, v4.8h, v0.h[6]
+        SMLAL   v24.4s, v5.4h, v0.h[6]
+        SMLAL2  v28.4s, v5.8h, v0.h[6]
+        SMLAL   v17.4s, v4.4h, v1.h[6]
+        SMLAL2  v21.4s, v4.8h, v1.h[6]
+        SMLAL   v25.4s, v5.4h, v1.h[6]
+        SMLAL2  v29.4s, v5.8h, v1.h[6]
+        SMLAL   v18.4s, v4.4h, v2.h[6]
+        SMLAL2  v22.4s, v4.8h, v2.h[6]
+        SMLAL   v26.4s, v5.4h, v2.h[6]
+        SMLAL2  v30.4s, v5.8h, v2.h[6]
+        SMLAL   v19.4s, v4.4h, v3.h[6]
+        SMLAL2  v23.4s, v4.8h, v3.h[6]
+        SMLAL   v27.4s, v5.4h, v3.h[6]
+        SMLAL2  v31.4s, v5.8h, v3.h[6]
+        B       3b
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ     x1, 3, 6f
+        STR     d3, [x7], 8
+        STR     d2, [x17], 8
+        DUP     d3, v3.d[1]
+        DUP     d2, v2.d[1]
+        STR     d1, [x16], 8
+        STR     d0, [x6], 8
+        DUP     d1, v1.d[1]
+        DUP     d0, v0.d[1]
+6:
+        TBZ     x1, 2, 7f
+        STR     s3, [x7], 4
+        STR     s2, [x17], 4
+        DUP     s3, v3.s[1]
+        DUP     s2, v2.s[1]
+        STR     s1, [x16], 4
+        STR     s0, [x6], 4
+        DUP     s1, v1.s[1]
+        DUP     s0, v0.s[1]
+7:
+        TBZ     x1, 1, 8f
+        STR     h3, [x7], 2
+        STR     h2, [x17], 2
+        DUP     h3, v3.h[1]
+        DUP     h2, v2.h[1]
+        STR     h1, [x16], 2
+        STR     h0, [x6], 2
+        DUP     h1, v1.h[1]
+        DUP     h0, v0.h[1]
+8:
+        TBZ     x1, 0, 9f
+        STR     b3, [x7]
+        STR     b2, [x17]
+        STR     b1, [x16]
+        STR     b0, [x6]
+9:
+        # Restore x20-x21 from stack
+        LDP     x20, x21, [sp], 16
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 236898f..81c4ed4 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -1029,15 +1029,21 @@
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal)
 
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
-
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
+
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32)
@@ -1390,6 +1396,9 @@
 DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
 DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
+
 DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32)
 DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64)
 DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32)
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 1c47ddc..935ec5f 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -851,15 +851,21 @@
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal)
 
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
-
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
+
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55)
@@ -1146,6 +1152,9 @@
 DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
 DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
 
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
+
 DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64)
 DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128)
 DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55)
diff --git a/test/qc8-gemm-minmax-fp32.cc b/test/qc8-gemm-minmax-fp32.cc
index f36b52b..4413640 100644
--- a/test/qc8-gemm-minmax-fp32.cc
+++ b/test/qc8-gemm-minmax-fp32.cc
@@ -21911,6 +21911,918 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
   TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
diff --git a/test/qc8-gemm-minmax-fp32.yaml b/test/qc8-gemm-minmax-fp32.yaml
index c7637ca..b2ca2a3 100644
--- a/test/qc8-gemm-minmax-fp32.yaml
+++ b/test/qc8-gemm-minmax-fp32.yaml
@@ -147,6 +147,12 @@
 - name: xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
   init: xnn_init_qs8_minmax_neon_params
   k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+  init: xnn_init_qs8_minmax_neon_params
+  k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+  init: xnn_init_qs8_minmax_neon_params
+  k-block: 8
 - name: xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32
   init: xnn_init_qs8_minmax_neon_params
   k-block: 4
diff --git a/test/qc8-igemm-minmax-fp32.cc b/test/qc8-igemm-minmax-fp32.cc
index 035e068..850afa7 100644
--- a/test/qc8-igemm-minmax-fp32.cc
+++ b/test/qc8-igemm-minmax-fp32.cc
@@ -21083,7 +21083,7 @@
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21093,10 +21093,10 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21107,10 +21107,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -21123,12 +21123,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -21140,11 +21140,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -21156,11 +21156,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -21171,11 +21171,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21189,13 +21189,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21206,11 +21206,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21224,13 +21224,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -21241,11 +21241,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21259,13 +21259,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21277,12 +21277,12 @@
           .m(4)
           .n(n)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21295,12 +21295,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21314,13 +21314,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21332,12 +21332,12 @@
           .m(4)
           .n(n)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21350,12 +21350,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21369,13 +21369,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -21387,11 +21387,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21406,13 +21406,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21425,12 +21425,12 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21443,12 +21443,12 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21463,13 +21463,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -21482,11 +21482,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21501,12 +21501,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21517,10 +21517,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21531,10 +21531,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21545,13 +21545,13 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
 
 
 #if XNN_ARCH_ARM64
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_eq_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21561,10 +21561,10 @@
       .m(4)
       .n(16)
       .k(8)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21575,10 +21575,10 @@
       .n(16)
       .k(8)
       .cn_stride(19)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       for (uint32_t n = 1; n <= 16; n++) {
@@ -21591,12 +21591,12 @@
           .n(n)
           .k(8)
           .iterations(1)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile_m) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t m = 1; m <= 4; m++) {
       GemmMicrokernelTester()
@@ -21608,11 +21608,11 @@
         .n(16)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile_n) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 1; n <= 16; n++) {
       GemmMicrokernelTester()
@@ -21624,11 +21624,11 @@
         .n(n)
         .k(8)
         .iterations(1)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_lt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       GemmMicrokernelTester()
@@ -21639,11 +21639,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_lt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k < 8; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21657,13 +21657,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_gt_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       GemmMicrokernelTester()
@@ -21674,11 +21674,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_gt_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 9; k < 16; k++) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21692,13 +21692,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_div_8) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       GemmMicrokernelTester()
@@ -21709,11 +21709,11 @@
         .m(4)
         .n(16)
         .k(k)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_div_8_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 16; k <= 80; k += 8) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21727,13 +21727,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_gt_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21745,12 +21745,12 @@
           .m(4)
           .n(n)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_gt_16_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21763,12 +21763,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_gt_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21782,13 +21782,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_div_16) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21800,12 +21800,12 @@
           .m(4)
           .n(n)
           .k(k)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_div_16_strided_cn) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21818,12 +21818,12 @@
           .n(n)
           .k(k)
           .cn_stride(19)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_div_16_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21837,13 +21837,13 @@
             .n(n)
             .k(k)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -21855,11 +21855,11 @@
         .n(16)
         .k(k)
         .ks(3)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, small_kernel_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21874,13 +21874,13 @@
             .k(k)
             .ks(3)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_gt_16_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 17; n < 32; n++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21893,12 +21893,12 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_div_16_small_kernel) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_small_kernel) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t n = 32; n <= 48; n += 16) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21911,12 +21911,12 @@
           .n(n)
           .k(k)
           .ks(3)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, strided_cm_subtile) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       for (uint32_t m = 1; m <= 4; m++) {
@@ -21931,13 +21931,13 @@
             .k(k)
             .cm_stride(19)
             .iterations(1)
-            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+            .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
         }
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, a_offset) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, a_offset) {
     TEST_REQUIRES_ARM_NEON;
     for (size_t k = 1; k <= 40; k += 9) {
       GemmMicrokernelTester()
@@ -21950,11 +21950,11 @@
         .k(k)
         .ks(3)
         .a_offset(163)
-        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, zero) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, zero) {
     TEST_REQUIRES_ARM_NEON;
     for (uint32_t mz = 0; mz < 4; mz++) {
       for (size_t k = 1; k <= 40; k += 9) {
@@ -21969,12 +21969,12 @@
           .ks(3)
           .a_offset(163)
           .zero_index(mz)
-          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+          .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
       }
     }
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, qmin) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21985,10 +21985,10 @@
       .n(16)
       .k(8)
       .qmin(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, qmax) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -21999,10 +21999,10 @@
       .n(16)
       .k(8)
       .qmax(128)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 
-  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, strided_cm) {
+  TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
     TEST_REQUIRES_ARM_NEON;
     GemmMicrokernelTester()
       .mr(4)
@@ -22013,7 +22013,7 @@
       .n(16)
       .k(8)
       .cm_stride(19)
-      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
   }
 #endif  // XNN_ARCH_ARM64
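
For reference, a minimal sketch of the fp32 requantization these minmax tests validate against, assuming the usual scheme (scale the int32 accumulator by a float multiplier, round to nearest-even, add the output zero point, clamp); this is an illustration, not the XNNPACK reference code, and the function name is hypothetical:

  #include <algorithm>
  #include <cmath>
  #include <cstdint>

  // Illustrative fp32 requantization: int32 accumulator -> int8 output.
  // For QC8 kernels the scale is per-channel; for QS8 it is per-tensor.
  int8_t requantize_fp32(int32_t acc, float scale,
                         int8_t zero_point, int8_t qmin, int8_t qmax) {
    const float scaled = static_cast<float>(acc) * scale;
    // std::lrintf rounds to nearest, ties to even, under the default
    // floating-point environment.
    int32_t out = static_cast<int32_t>(std::lrintf(scaled));
    out += zero_point;
    out = std::min<int32_t>(std::max<int32_t>(out, qmin), qmax);
    return static_cast<int8_t>(out);
  }

The qmin/qmax test cases above pin the clamp bounds, and the strided_cm/strided_cn cases check that the kernel honors non-contiguous output strides around this arithmetic.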
 
diff --git a/test/qc8-igemm-minmax-fp32.yaml b/test/qc8-igemm-minmax-fp32.yaml
index 97a9aa7..ce31853 100644
--- a/test/qc8-igemm-minmax-fp32.yaml
+++ b/test/qc8-igemm-minmax-fp32.yaml
@@ -138,10 +138,10 @@
 - name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
   init: xnn_init_qs8_minmax_neon_params
   k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
   init: xnn_init_qs8_minmax_neon_params
   k-block: 8
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
   init: xnn_init_qs8_minmax_neon_params
   k-block: 8
 - name: xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
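
The k-block: 8 entries above drive which test cases the generator emits (k_eq_8, k_lt_8, k_gt_8, k_div_8): the ld64 kernels consume K in blocks of 8, one 64-bit load of eight int8 values per row of A per main-loop iteration, with a remainder path for k % 8. A rough sketch of that blocking, illustrative only and not the assembly:

  #include <cstddef>

  // Illustrative k-blocking for k-block = 8: full 8-element iterations,
  // then a trailing remainder of 1..7 values. The generated tests probe
  // exactly these paths (k == 8, k < 8, k > 8, k a multiple of 8).
  size_t count_full_k_blocks(size_t kc) {
    size_t full_blocks = 0;
    size_t k = kc;
    while (k >= 8) {
      // one 8-byte (ld64) load per row of A; eight mlal-lane steps against B
      full_blocks += 1;
      k -= 8;
    }
    // k now holds the remainder handled by the kernel's tail path
    return full_blocks;
  }
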
diff --git a/test/qs8-gemm-minmax-fp32.cc b/test/qs8-gemm-minmax-fp32.cc
index 7447cf5..0983a12 100644
--- a/test/qs8-gemm-minmax-fp32.cc
+++ b/test/qs8-gemm-minmax-fp32.cc
@@ -21911,6 +21911,918 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
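
The _prfm_ variants tested above differ from the plain ld64 kernels only in issuing software prefetch hints (PRFM) for upcoming panels of A and B; the blocking, including k-block = 8, is identical, which is why this test block and the ld64 block below mirror each other. A rough C++ analogue of such a hint, offered as an illustration (the kernels use the PRFM instruction directly):

  // __builtin_prefetch is the GCC/Clang spelling of a software prefetch
  // hint: read access (rw = 0), high temporal locality (locality = 3).
  inline void prefetch_panels(const void* a, const void* b) {
    __builtin_prefetch(a, /*rw=*/0, /*locality=*/3);
    __builtin_prefetch(b, /*rw=*/0, /*locality=*/3);
  }
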
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
diff --git a/test/qs8-gemm-minmax-fp32.yaml b/test/qs8-gemm-minmax-fp32.yaml
index 2e33bed..32df48e 100644
--- a/test/qs8-gemm-minmax-fp32.yaml
+++ b/test/qs8-gemm-minmax-fp32.yaml
@@ -147,6 +147,12 @@
 - name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 8
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 8
 - name: xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 4
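
The rndnu tests added below exercise the fixed-point requantization path: the float scale is expressed as an int32 multiplier plus a right shift, and rounding is to nearest with ties rounded up (hence "rndnu"). A minimal sketch of that rounding arithmetic, illustrative only (the parameter layout here is an assumption, not the layout of xnn_init_qs8_conv_minmax_rndnu_neon_params):

  #include <algorithm>
  #include <cstdint>

  // Illustrative rndnu requantization: widen to int64, multiply by a
  // fixed-point multiplier, then arithmetic-right-shift with a rounding
  // constant that rounds to nearest, ties toward +infinity.
  // Assumes shift >= 1.
  int8_t requantize_rndnu(int32_t acc, int32_t multiplier, uint32_t shift,
                          int8_t zero_point, int8_t qmin, int8_t qmax) {
    const int64_t product = static_cast<int64_t>(acc) * multiplier;
    const int64_t rounding = INT64_C(1) << (shift - 1);
    int32_t out = static_cast<int32_t>((product + rounding) >> shift);
    out += zero_point;
    out = std::min<int32_t>(std::max<int32_t>(out, qmin), qmax);
    return static_cast<int8_t>(out);
  }
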
diff --git a/test/qs8-gemm-minmax-rndnu.cc b/test/qs8-gemm-minmax-rndnu.cc
index 7fc86d4..f7fcb6b 100644
--- a/test/qs8-gemm-minmax-rndnu.cc
+++ b/test/qs8-gemm-minmax-rndnu.cc
@@ -71159,6 +71159,918 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM64
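
The PRFM_LD64 block above follows the taxonomy used throughout these generated suites: k_eq_8, k_lt_8, k_gt_8, and k_div_8 pin K at, below, just past, and at full multiples of the kernel's k-block of 8; n_gt_16 and n_div_16 drive the kernel's internal NC loop past a single 16-wide tile; the subtile variants sweep partial m-by-n tiles; and the strided_a/strided_cn/strided_cm variants decouple the logical sizes from the buffer strides. A minimal sketch of the reference accumulation such a tester compares against, assuming row-major A and B and a per-column bias (names here are illustrative, not GemmMicrokernelTester internals):

    // Hypothetical reference accumulation: int8 inputs widened to int32,
    // bias added per output column, requantization applied separately.
    #include <cstddef>
    #include <cstdint>

    static void ReferenceGemmS8(
        size_t m, size_t n, size_t k,
        const int8_t* a,      // m x k, row-major
        const int8_t* b,      // k x n, row-major
        const int32_t* bias,  // n entries
        int32_t* acc) {       // m x n, row-major
      for (size_t i = 0; i < m; i++) {
        for (size_t j = 0; j < n; j++) {
          int32_t sum = bias[j];
          for (size_t p = 0; p < k; p++) {
            sum += int32_t(a[i * k + p]) * int32_t(b[p * n + j]);
          }
          acc[i * n + j] = sum;
        }
      }
    }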
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .a_stride(11)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(11)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(19)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .a_stride(83)
+        .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_a) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .a_stride(43)
+          .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM64
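
All of the rndnu suites pass xnn_qs8_requantize_rndnu as the scalar reference for the kernel's requantization. A hedged scalar model of round-to-nearest-up requantization under a multiplier/right-shift decomposition (an illustration of the naming only, not the library routine or its NEON VQRDMULH/VRSHL sequence; assumes shift >= 1):

    #include <algorithm>
    #include <cstdint>

    static int8_t RequantizeRndnu(
        int32_t acc, int32_t multiplier, uint32_t shift,
        int8_t zero_point, int8_t qmin, int8_t qmax) {
      const int64_t product = int64_t(acc) * int64_t(multiplier);
      // Adding half the divisor before an arithmetic right shift rounds to
      // nearest, with exact ties (.5) landing on the larger value -- the
      // "round-to-nearest-up" the rndnu name refers to.
      const int64_t rounded = (product + (INT64_C(1) << (shift - 1))) >> shift;
      int32_t out = int32_t(rounded) + int32_t(zero_point);
      out = std::max(out, int32_t(qmin));
      out = std::min(out, int32_t(qmax));
      return int8_t(out);
    }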
+
+
+#if XNN_ARCH_ARM64
   TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
     TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
diff --git a/test/qs8-gemm-minmax-rndnu.yaml b/test/qs8-gemm-minmax-rndnu.yaml
index e737786..f514217 100644
--- a/test/qs8-gemm-minmax-rndnu.yaml
+++ b/test/qs8-gemm-minmax-rndnu.yaml
@@ -471,6 +471,12 @@
 - name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+  init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+  k-block: 8
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+  init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+  k-block: 8
 - name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld32
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 4
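
The k-block value in these .yaml entries is what the test generator uses to derive the K sweeps seen in the generated .cc files above: k_eq_8 at the block, k_lt_8 below it, k_gt_8 through the next partial block, and k_div_8 over full multiples up to ten blocks. A hypothetical mirror of that derivation (the real generator is a Python script; this reproduces only the arithmetic):

    #include <cstddef>

    struct KSweeps {
      size_t k_eq;       // k_eq_8: exactly one block          -> k = 8
      size_t k_lt_max;   // k_lt_8: k = 1 .. block-1           -> 1..7
      size_t k_gt_max;   // k_gt_8: k = block+1 .. 2*block-1   -> 9..15
      size_t k_div_max;  // k_div_8: k = 2*block .. 10*block   -> 16..80 step 8
    };

    static KSweeps SweepsFromKBlock(size_t k_block) {
      return KSweeps{k_block, k_block - 1, 2 * k_block - 1, 10 * k_block};
    }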
diff --git a/test/qs8-igemm-minmax-fp32.cc b/test/qs8-igemm-minmax-fp32.cc
index 7e73ff8..09d3bde 100644
--- a/test/qs8-igemm-minmax-fp32.cc
+++ b/test/qs8-igemm-minmax-fp32.cc
@@ -22019,6 +22019,942 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
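
The fp32 suites swap in xnn_init_qs8_conv_minmax_fp32_neonv8_params and xnn_qs8_requantize_fp32: the int32 accumulator is scaled in single precision rather than by a fixed-point multiplier. A hedged scalar model of that reference (assumes the default round-to-nearest-even floating-point environment; not the NEONv8 code path itself):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    static int8_t RequantizeFp32(
        int32_t acc, float scale, int8_t zero_point,
        int8_t qmin, int8_t qmax) {
      // Scale in fp32, round to nearest-even, then offset and clamp to
      // the quantized output range.
      const float scaled = float(acc) * scale;
      int32_t out = int32_t(std::nearbyintf(scaled)) + int32_t(zero_point);
      out = std::max(out, int32_t(qmin));
      out = std::min(out, int32_t(qmax));
      return int8_t(out);
    }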
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+  }
+#endif  // XNN_ARCH_ARM64
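
Relative to the GEMM suites, the IGEMM tests add three knobs: ks(3), the number of indirection pointers consumed per output row; a_offset(163), a byte offset the kernel adds to every non-zero indirection pointer at call time; and zero_index, which selects which entries are replaced by the shared zero buffer (and must therefore not receive the offset). A minimal illustrative layout of such an indirection buffer (assumed names, not the tester's actual construction):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // ks pointers per output row; rows selected by zero_index point at a
    // shared zero buffer that the kernel leaves untouched when applying
    // a_offset to the remaining entries.
    static std::vector<const int8_t*> BuildIndirection(
        size_t mr, size_t ks, size_t k,
        const int8_t* input,        // mr*ks contiguous k-length rows
        const int8_t* zero_buffer,  // k bytes of zero-point values
        size_t zero_index) {        // which output row reads only zeros
      std::vector<const int8_t*> indirection(mr * ks);
      for (size_t i = 0; i < mr; i++) {
        for (size_t s = 0; s < ks; s++) {
          indirection[i * ks + s] =
              (i == zero_index) ? zero_buffer : input + (i * ks + s) * k;
        }
      }
      return indirection;
    }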
+
+
+#if XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
diff --git a/test/qs8-igemm-minmax-fp32.yaml b/test/qs8-igemm-minmax-fp32.yaml
index 8403032..c5cd1a0 100644
--- a/test/qs8-igemm-minmax-fp32.yaml
+++ b/test/qs8-igemm-minmax-fp32.yaml
@@ -144,6 +144,12 @@
 - name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 8
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+  init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+  k-block: 8
 - name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
   init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
   k-block: 16
diff --git a/test/qs8-igemm-minmax-rndnu.cc b/test/qs8-igemm-minmax-rndnu.cc
index 7a10c55..5738241 100644
--- a/test/qs8-igemm-minmax-rndnu.cc
+++ b/test/qs8-igemm-minmax-rndnu.cc
@@ -72563,6 +72563,942 @@
 
 
 #if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cn_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      for (uint32_t n = 1; n <= 16; n++) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(m)
+          .n(n)
+          .k(8)
+          .iterations(1)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t m = 1; m <= 4; m++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(m)
+        .n(16)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 1; n <= 16; n++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(n)
+        .k(8)
+        .iterations(1)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k < 8; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 9; k < 16; k++) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 16; k <= 80; k += 8) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(19)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        for (uint32_t m = 1; m <= 4; m++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .ks(3)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 17; n < 32; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_small_kernel) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t n = 32; n <= 48; n += 16) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .ks(3)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      for (uint32_t m = 1; m <= 4; m++) {
+        for (uint32_t n = 1; n <= 16; n++) {
+          GemmMicrokernelTester()
+            .mr(4)
+            .nr(16)
+            .kr(1)
+            .sr(1)
+            .m(m)
+            .n(n)
+            .k(k)
+            .cm_stride(19)
+            .iterations(1)
+            .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+        }
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, a_offset) {
+    TEST_REQUIRES_ARM_NEON;
+    for (size_t k = 1; k <= 40; k += 9) {
+      GemmMicrokernelTester()
+        .mr(4)
+        .nr(16)
+        .kr(1)
+        .sr(1)
+        .m(4)
+        .n(16)
+        .k(k)
+        .ks(3)
+        .a_offset(163)
+        .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, zero) {
+    TEST_REQUIRES_ARM_NEON;
+    for (uint32_t mz = 0; mz < 4; mz++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .mr(4)
+          .nr(16)
+          .kr(1)
+          .sr(1)
+          .m(4)
+          .n(16)
+          .k(k)
+          .ks(3)
+          .a_offset(163)
+          .zero_index(mz)
+          .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+      }
+    }
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmin(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .qmax(128)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+
+  TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+    TEST_REQUIRES_ARM_NEON;
+    GemmMicrokernelTester()
+      .mr(4)
+      .nr(16)
+      .kr(1)
+      .sr(1)
+      .m(4)
+      .n(16)
+      .k(8)
+      .cm_stride(19)
+      .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+  }
+#endif  // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
   TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
     TEST_REQUIRES_ARM_NEON_DOT;
     GemmMicrokernelTester()
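
Aside on the generated cases above: every k-dimension bucket (k_eq_8, k_lt_8, k_gt_8, k_div_8) derives from the kernel's k-block of 8, which the yaml specs below declare. A minimal, self-contained sketch of that rounding, assuming only the convention visible in the tests themselves (round_up_k is an illustrative helper, not an XNNPACK API):

  #include <cassert>
  #include <cstddef>

  // Illustrative helper (not an XNNPACK function): round k up to the k-block,
  // mirroring the padding applied to the packed weights' K dimension.
  static size_t round_up_k(size_t k, size_t k_block) {
    return (k + k_block - 1) / k_block * k_block;
  }

  int main() {
    const size_t k_block = 8;              // matches "k-block: 8" in the yaml
    assert(round_up_k(8, k_block) == 8);   // k_eq_8: exactly one full block
    assert(round_up_k(5, k_block) == 8);   // k_lt_8 (1..7): remainder path only
    assert(round_up_k(11, k_block) == 16); // k_gt_8 (9..15): block + remainder
    assert(round_up_k(64, k_block) == 64); // k_div_8 (16..80): main loop only
    return 0;
  }
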
diff --git a/test/qs8-igemm-minmax-rndnu.yaml b/test/qs8-igemm-minmax-rndnu.yaml
index 5d426f9..10eaa39 100644
--- a/test/qs8-igemm-minmax-rndnu.yaml
+++ b/test/qs8-igemm-minmax-rndnu.yaml
@@ -468,6 +468,12 @@
 - name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 8
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+  init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+  k-block: 8
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+  init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+  k-block: 8
 - name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55
   init: xnn_init_qs8_conv_minmax_rndnu_neon_params
   k-block: 16
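
For reference, the two entry points registered above follow the qs8 igemm microkernel calling convention. A hedged sketch of the prototype — the argument comments are descriptive guesses reconstructed from the tester parameters (ks, a_offset, zero_index, cm/cn strides); the authoritative declaration and the exact params union live in the XNNPACK headers:

  #include <cstddef>
  #include <cstdint>

  // Sketch only: the real declaration takes the params union produced by
  // xnn_init_qs8_conv_minmax_rndnu_neon_params, abbreviated here as void*.
  extern "C" void xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
      size_t mr,             // rows of C to compute, 1..4
      size_t nc,             // columns of C; the kernel iterates 16-wide blocks
      size_t kc,             // K extent in int8 elements
      size_t ks,             // indirection window in bytes (kernel size * mr * sizeof(void*))
      const int8_t** a,      // indirection buffer of input-row pointers
      const void* w,         // weights packed for nr = 16, K padded to the k-block
      int8_t* c,             // output tile
      size_t cm_stride,      // bytes between output rows
      size_t cn_stride,      // bytes between 16-column output blocks
      size_t a_offset,       // byte offset added to non-"zero" input pointers
      const int8_t* zero,    // substitute row used for zero_index entries
      const void* params);   // quantization bounds, multiplier, and shifts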