4x16 lane AArch64 NEON GEMM/IGEMM ld64 microkernel
PiperOrigin-RevId: 411642422
diff --git a/BUILD.bazel b/BUILD.bazel
index 9e3cfcc..e9cb45d 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -6199,6 +6199,18 @@
]
AARCH64_ASM_MICROKERNEL_SRCS = [
+ "src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
+ "src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S",
+ "src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
+ "src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S",
+ "src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
+ "src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S",
+ "src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S",
+ "src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
+ "src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
+ "src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S",
+ "src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S",
+ "src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
"src/f16-gemm/gen-inc/1x8inc-minmax-aarch64-neonfp16arith-ld64.S",
"src/f16-gemm/gen-inc/1x16inc-minmax-aarch64-neonfp16arith-ld32.S",
"src/f16-gemm/gen-inc/4x8inc-minmax-aarch64-neonfp16arith-ld64.S",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ce1cde9..6bdb274 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5186,6 +5186,18 @@
src/f32-igemm/gen/4x8-minmax-aarch32-neon-pld-cortex-a75.S)
SET(AARCH64_ASM_MICROKERNEL_SRCS
+ src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+ src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+ src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+ src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+ src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+ src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+ src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+ src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
+ src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+ src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+ src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+ src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
src/f16-gemm/gen-inc/1x8inc-minmax-aarch64-neonfp16arith-ld64.S
src/f16-gemm/gen-inc/1x16inc-minmax-aarch64-neonfp16arith-ld32.S
src/f16-gemm/gen-inc/4x8inc-minmax-aarch64-neonfp16arith-ld64.S
diff --git a/bench/qs8-gemm-e2e.cc b/bench/qs8-gemm-e2e.cc
index a39dc8a..4af2454 100644
--- a/bench/qs8-gemm-e2e.cc
+++ b/bench/qs8-gemm-e2e.cc
@@ -136,6 +136,26 @@
4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
benchmark::utils::CheckNEON);
}
+ static void qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
+ xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckNEON);
+ }
+ static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
+ xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 4 /* mr */, 16 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckNEON);
+ }
static void qs8_gemm_2x8c8__aarch64_neon_mlal(benchmark::State& state, models::ExecutionPlanFactory model) {
GEMMEnd2EndBenchmark(state, model,
xnn_qs8_gemm_minmax_rndnu_ukernel_2x8c8__aarch64_neon_mlal,
@@ -183,6 +203,8 @@
BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld128)
BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
+ BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
+ BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm_cortex_a53)
BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_cortex_a53)
BENCHMARK_QS8_END2END(qs8_gemm_2x8c8__aarch64_neon_mlal_prfm)
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index 4630cc0..d98b51f 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -244,6 +244,14 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, 4, 16, 1, 1,
xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
}
+ static void qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, 4, 16, 1, 1,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+ }
+ static void qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, 4, 16, 1, 1,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+ }
static void qs8_gemm_1x8c8__aarch64_neon_mlal_prfm(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c8__aarch64_neon_mlal_prfm, 1, 8, 8, 1,
xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
@@ -293,6 +301,8 @@
BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_cortex_a55)
BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
+ BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
+ BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_prfm)
BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal)
BENCHMARK_GEMM(qs8_gemm_1x8c8__aarch64_neon_mlal_prfm_cortex_a53)
diff --git a/scripts/generate-qs8-gemm.sh b/scripts/generate-qs8-gemm.sh
index 3be3587..973bdff 100755
--- a/scripts/generate-qs8-gemm.sh
+++ b/scripts/generate-qs8-gemm.sh
@@ -607,10 +607,7 @@
tools/xngen src/qu8-gemm/c4-neondot.c.in -D MR=3 -D NR=32 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -o src/qu8-gemm/gen/3x32c4-minmax-rndnu-neondot.c &
############################### AArch64 assembly ##############################
-# Cortex A53 micro-kernel
-tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S &
-tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
-
+### Cortex-A53 lane micro-kernels
tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S &
tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
@@ -620,7 +617,19 @@
tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=FP32 -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S &
tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=FP32 -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
-# QU8 micro-kernels
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S &
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
+
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=1 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S &
+
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=0 -D REQUANTIZATION=FP32 -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=1 -D REQUANTIZATION=FP32 -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S &
+
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=0 -D REQUANTIZATION=FP32 -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=1 -D REQUANTIZATION=FP32 -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S &
+
+### QU8 micro-kernels
tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QU8 -o src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S &
tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QU8 -o src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
diff --git a/scripts/generate-qs8-igemm.sh b/scripts/generate-qs8-igemm.sh
index d018881..6355a06 100755
--- a/scripts/generate-qs8-igemm.sh
+++ b/scripts/generate-qs8-igemm.sh
@@ -613,10 +613,7 @@
tools/xngen src/qu8-igemm/c4-neondot.c.in -D MR=3 -D NR=32 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -o src/qu8-igemm/gen/3x32c4-minmax-rndnu-neondot.c &
############################### AArch64 assembly ##############################
-# Cortex A53 micro-kernel
-tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S &
-tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
-
+### Cortex-A53 lane micro-kernels
tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S &
tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
@@ -626,13 +623,26 @@
tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=FP32 -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S &
tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=FP32 -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
-# QU8 micro-kernels
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S &
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
+
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=1 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S &
+
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=0 -D REQUANTIZATION=FP32 -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=1 -D REQUANTIZATION=FP32 -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S &
+
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=0 -D REQUANTIZATION=FP32 -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=1 -D REQUANTIZATION=FP32 -D CHANNELWISE=1 -D DATATYPE=QC8 -o src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S &
+
+### QU8 micro-kernels
tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QU8 -o src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S &
tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QU8 -o src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a75.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QU8 -o src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a75.S &
tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a75.S.in -D PREFETCH=1 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QU8 -o src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S &
+
### C4 micro-kernels
tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S &
tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in -D REQUANTIZATION=GEMMLOWP -D CHANNELWISE=0 -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S &
diff --git a/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..f9848c0
--- /dev/null
+++ b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,590 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const int8_t* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qs8_minmax_params params) [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3 x4 v3
+# B x5 v4 v5 v6
+# C0 x6 v16 v20 v24 v28
+# C1 x8 v17 v21 v25 v29
+# C2 x9 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+# x10 x17 unused (temp registers in the Cortex-A53 variant of this kernel)
+
+BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+ # Clamp A and C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x12, x11, [sp] // Load cn_stride, params
+ ADD x15, x3, x4 // a1 = a0 + a_stride
+ ADD x8, x6, x7 // c1 = c0 + cm_stride
+ CSEL x15, x3, x15, LO // a1 = a0
+ CSEL x8, x6, x8, LO // c1 = c0
+
+ ADD x13, x15, x4 // a2 = a1 + a_stride
+ ADD x9, x8, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x13, x15, x13, LS // a2 = a1
+ CSEL x9, x8, x9, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ ADD x4, x13, x4 // a3 = a2 + a_stride
+ ADD x7, x9, x7 // c3 = c2 + cm_stride
+ CSEL x4, x13, x4, LO // a3 = a2
+ CSEL x7, x9, x7, LO // c3 = c2
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ SUBS x0, x2, 8 // k = kc - 8
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ # Is there at least 8 bytes for main loop?
+ B.LO 3f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+1:
+ LD1 {v0.8b}, [x3], 8
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x15], 8
+ LD1 {v2.8b}, [x13], 8
+ LD1 {v3.8b}, [x4], 8
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[7]
+ SMLAL2 v20.4s, v4.8h, v0.h[7]
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v17.4s, v4.4h, v1.h[7]
+ SMLAL2 v21.4s, v4.8h, v1.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v18.4s, v4.4h, v2.h[7]
+ SMLAL2 v22.4s, v4.8h, v2.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v19.4s, v4.4h, v3.h[7]
+ SMLAL2 v23.4s, v4.8h, v3.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 1b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+ # Is there a remainder?- 1 to 7 bytes of A
+ CBNZ x0, 3f
+
+2:
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ # Load per channel scale values from weights
+ LDR q4, [x5], 16
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ LDR q5, [x5], 16
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ LDR q6, [x5], 16
+ FMUL v16.4s, v16.4s, v4.4s
+ FMUL v17.4s, v17.4s, v4.4s
+ FMUL v18.4s, v18.4s, v4.4s
+ FMUL v19.4s, v19.4s, v4.4s
+ FMUL v20.4s, v20.4s, v5.4s
+ LDR q4, [x5], 16
+ FMUL v21.4s, v21.4s, v5.4s
+ FMUL v22.4s, v22.4s, v5.4s
+ FMUL v23.4s, v23.4s, v5.4s
+ FMUL v24.4s, v24.4s, v6.4s
+ FMUL v25.4s, v25.4s, v6.4s
+ FMUL v26.4s, v26.4s, v6.4s
+ FMUL v27.4s, v27.4s, v6.4s
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v6.8h
+ SQADD v17.8h, v17.8h, v6.8h
+ SQADD v18.8h, v18.8h, v6.8h
+ SQADD v19.8h, v19.8h, v6.8h
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v4.16b}, [x11], 1 // clamp min value
+
+ SQXTN v0.8b, v16.8h
+ SQXTN v1.8b, v17.8h
+ SQXTN v2.8b, v18.8h
+ SQXTN v3.8b, v19.8h
+ LD1R {v5.16b}, [x11] // clamp max value
+ SQXTN2 v0.16b, v24.8h
+ SQXTN2 v1.16b, v25.8h
+ SQXTN2 v2.16b, v26.8h
+ SQXTN2 v3.16b, v27.8h
+ SUB x11, x11, 3 // rewind params pointer
+
+ SMAX v0.16b, v0.16b, v4.16b
+ SMAX v1.16b, v1.16b, v4.16b
+ SMAX v2.16b, v2.16b, v4.16b
+ SMAX v3.16b, v3.16b, v4.16b
+ SUBS x1, x1, 16
+ SMIN v0.16b, v0.16b, v5.16b
+ SMIN v1.16b, v1.16b, v5.16b
+ SMIN v2.16b, v2.16b, v5.16b
+ SMIN v3.16b, v3.16b, v5.16b
+ B.LO 4f
+
+ # Store full 4 x 16
+ ST1 {v0.16b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v1.16b}, [x8], x12
+ SUB x15, x15, x2 // a1 -= kc
+ ST1 {v2.16b}, [x9], x12
+ SUB x13, x13, x2 // a2 -= kc
+ ST1 {v3.16b}, [x7], x12
+ SUB x4, x4, x2 // a3 -= kc
+ B.NE 0b
+ RET
+
+ # Remainder- 1 to 7 bytes of A
+ .p2align 3
+3:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x3], x0
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x15], x0
+ LD1 {v2.8b}, [x13], x0
+ LD1 {v3.8b}, [x4], x0
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 2b
+
+ # Store odd width
+ .p2align 3
+4:
+ TBZ x1, 3, 5f
+ STR d0, [x6], 8
+ STR d1, [x8], 8
+ DUP d0, v0.d[1]
+ DUP d1, v1.d[1]
+ STR d2, [x9], 8
+ STR d3, [x7], 8
+ DUP d2, v2.d[1]
+ DUP d3, v3.d[1]
+5:
+ TBZ x1, 2, 6f
+ STR s0, [x6], 4
+ STR s1, [x8], 4
+ DUP s0, v0.s[1]
+ DUP s1, v1.s[1]
+ STR s2, [x9], 4
+ STR s3, [x7], 4
+ DUP s2, v2.s[1]
+ DUP s3, v3.s[1]
+6:
+ TBZ x1, 1, 7f
+ STR h0, [x6], 2
+ STR h1, [x8], 2
+ DUP h0, v0.h[1]
+ DUP h1, v1.h[1]
+ STR h2, [x9], 2
+ STR h3, [x7], 2
+ DUP h2, v2.h[1]
+ DUP h3, v3.h[1]
+7:
+ TBZ x1, 0, 8f
+ STR b0, [x6]
+ STR b1, [x8]
+ STR b2, [x9]
+ STR b3, [x7]
+8:
+ RET
+
+END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..1c45866
--- /dev/null
+++ b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,596 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const int8_t* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3 x4 v3
+# B x5 v4 v5 v6
+# C0 x6 v16 v20 v24 v28
+# C1 x8 v17 v21 v25 v29
+# C2 x9 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+# x10 x17 temp registers for Cortex-A53 loads (unused in this variant — TODO confirm against template)
+
+BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+ # Clamp A and C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x12, x11, [sp] // Load cn_stride, params
+ ADD x15, x3, x4 // a1 = a0 + a_stride
+ ADD x8, x6, x7 // c1 = c0 + cm_stride
+ CSEL x15, x3, x15, LO // a1 = a0
+ CSEL x8, x6, x8, LO // c1 = c0
+
+ ADD x13, x15, x4 // a2 = a1 + a_stride
+ ADD x9, x8, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x13, x15, x13, LS // a2 = a1
+ CSEL x9, x8, x9, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ ADD x4, x13, x4 // a3 = a2 + a_stride
+ ADD x7, x9, x7 // c3 = c2 + cm_stride
+ CSEL x4, x13, x4, LO // a3 = a2
+ CSEL x7, x9, x7, LO // c3 = c2
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ SUBS x0, x2, 8 // k = kc - 8
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ # Is there at least 8 bytes for main loop?
+ B.LO 3f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+1:
+ LD1 {v0.8b}, [x3], 8
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x15], 8
+ LD1 {v2.8b}, [x13], 8
+ LD1 {v3.8b}, [x4], 8
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ PRFM PLDL1KEEP, [x13, 128]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ PRFM PLDL1KEEP, [x15, 128]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ PRFM PLDL1KEEP, [x3, 128]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ PRFM PLDL1KEEP, [x4, 128]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ PRFM PLDL1KEEP, [x5, 448]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ PRFM PLDL1KEEP, [x5, 512]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[7]
+ SMLAL2 v20.4s, v4.8h, v0.h[7]
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v17.4s, v4.4h, v1.h[7]
+ SMLAL2 v21.4s, v4.8h, v1.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v18.4s, v4.4h, v2.h[7]
+ SMLAL2 v22.4s, v4.8h, v2.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v19.4s, v4.4h, v3.h[7]
+ SMLAL2 v23.4s, v4.8h, v3.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 1b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+        # Is there a remainder? - 1 to 7 bytes of A
+ CBNZ x0, 3f
+
+2:
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ # Load per channel scale values from weights
+ LDR q4, [x5], 16
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ LDR q5, [x5], 16
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ LDR q6, [x5], 16
+ FMUL v16.4s, v16.4s, v4.4s
+ FMUL v17.4s, v17.4s, v4.4s
+ FMUL v18.4s, v18.4s, v4.4s
+ FMUL v19.4s, v19.4s, v4.4s
+ FMUL v20.4s, v20.4s, v5.4s
+ LDR q4, [x5], 16
+ FMUL v21.4s, v21.4s, v5.4s
+ FMUL v22.4s, v22.4s, v5.4s
+ FMUL v23.4s, v23.4s, v5.4s
+ FMUL v24.4s, v24.4s, v6.4s
+ FMUL v25.4s, v25.4s, v6.4s
+ FMUL v26.4s, v26.4s, v6.4s
+ FMUL v27.4s, v27.4s, v6.4s
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v6.8h
+ SQADD v17.8h, v17.8h, v6.8h
+ SQADD v18.8h, v18.8h, v6.8h
+ SQADD v19.8h, v19.8h, v6.8h
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v4.16b}, [x11], 1 // clamp min value
+
+ SQXTN v0.8b, v16.8h
+ SQXTN v1.8b, v17.8h
+ SQXTN v2.8b, v18.8h
+ SQXTN v3.8b, v19.8h
+ LD1R {v5.16b}, [x11] // clamp max value
+ SQXTN2 v0.16b, v24.8h
+ SQXTN2 v1.16b, v25.8h
+ SQXTN2 v2.16b, v26.8h
+ SQXTN2 v3.16b, v27.8h
+ SUB x11, x11, 3 // rewind params pointer
+
+ SMAX v0.16b, v0.16b, v4.16b
+ SMAX v1.16b, v1.16b, v4.16b
+ SMAX v2.16b, v2.16b, v4.16b
+ SMAX v3.16b, v3.16b, v4.16b
+ SUBS x1, x1, 16
+ SMIN v0.16b, v0.16b, v5.16b
+ SMIN v1.16b, v1.16b, v5.16b
+ SMIN v2.16b, v2.16b, v5.16b
+ SMIN v3.16b, v3.16b, v5.16b
+ B.LO 4f
+
+ # Store full 4 x 16
+ ST1 {v0.16b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v1.16b}, [x8], x12
+ SUB x15, x15, x2 // a1 -= kc
+ ST1 {v2.16b}, [x9], x12
+ SUB x13, x13, x2 // a2 -= kc
+ ST1 {v3.16b}, [x7], x12
+ SUB x4, x4, x2 // a3 -= kc
+ B.NE 0b
+ RET
+
+        # Remainder - 1 to 7 bytes of A
+ .p2align 3
+3:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x3], x0
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x15], x0
+ LD1 {v2.8b}, [x13], x0
+ LD1 {v3.8b}, [x4], x0
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 2b
+
+ # Store odd width
+ .p2align 3
+4:
+ TBZ x1, 3, 5f
+ STR d0, [x6], 8
+ STR d1, [x8], 8
+ DUP d0, v0.d[1]
+ DUP d1, v1.d[1]
+ STR d2, [x9], 8
+ STR d3, [x7], 8
+ DUP d2, v2.d[1]
+ DUP d3, v3.d[1]
+5:
+ TBZ x1, 2, 6f
+ STR s0, [x6], 4
+ STR s1, [x8], 4
+ DUP s0, v0.s[1]
+ DUP s1, v1.s[1]
+ STR s2, [x9], 4
+ STR s3, [x7], 4
+ DUP s2, v2.s[1]
+ DUP s3, v3.s[1]
+6:
+ TBZ x1, 1, 7f
+ STR h0, [x6], 2
+ STR h1, [x8], 2
+ DUP h0, v0.h[1]
+ DUP h1, v1.h[1]
+ STR h2, [x9], 2
+ STR h3, [x7], 2
+ DUP h2, v2.h[1]
+ DUP h3, v3.h[1]
+7:
+ TBZ x1, 0, 8f
+ STR b0, [x6]
+ STR b1, [x8]
+ STR b2, [x9]
+ STR b3, [x7]
+8:
+ RET
+
+END_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..ea14f76
--- /dev/null
+++ b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,618 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const int8_t** restrict a, x4
+# const int8_t* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x10
+# size_t a_offset, [sp + 8] -> x8
+# const int8_t* zero, [sp + 16] -> x12
+# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0
+# A1 x14 v1
+# A2 x15 v2
+# A3 x20 v3
+# B x5 v4 v5 v6
+# C0 x6 v16 v20 v24 v28
+# C1 x16 v17 v21 v25 v29
+# C2 x17 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+# x11 params pointer; x21 saved but unused here (A53-load temps in other variants — TODO confirm against template)
+
+BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+ # Clamp C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x10, x8, [sp] // Load cn_stride, a_offset
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x16, x6, x16, LO // c1 = c0
+
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ LDP x12, x11, [sp, 16] // Load zero, params pointer
+ // if mr <= 2
+ CSEL x17, x16, x17, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+ ADD x7, x17, x7 // c3 = c2 + cm_stride
+ CSEL x7, x17, x7, LO // c3 = c2
+
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ MOV x9, x3 // p = ks
+
+ .p2align 3
+1:
+ # Load next 4 A pointers
+ LDP x13, x14, [x4], 16
+ LDP x15, x20, [x4], 16
+
+ CMP x13, x12 // if a0 == zero
+ ADD x13, x13, x8 // a0 += a_offset
+ CSEL x13, x12, x13, EQ // a0 = zero, else += a0 + a_offset
+ CMP x14, x12 // if a1 == zero
+ ADD x14, x14, x8 // a1 += a_offset
+ CSEL x14, x12, x14, EQ // a1 = zero, else += a1 + a_offset
+ CMP x15, x12 // if a2 == zero
+ ADD x15, x15, x8 // a2 += a_offset
+ CSEL x15, x12, x15, EQ // a2 = zero, else += a2 + a_offset
+ CMP x20, x12 // if a3 == zero
+ ADD x20, x20, x8 // a3 += a_offset
+ CSEL x20, x12, x20, EQ // a3 = zero, else += a3 + a_offset
+
+ # Is there at least 8 bytes for main loop?
+ SUBS x0, x2, 8 // k = kc - 8
+ B.LO 4f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+2:
+ LD1 {v0.8b}, [x13], 8
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x14], 8
+ LD1 {v2.8b}, [x15], 8
+ LD1 {v3.8b}, [x20], 8
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[7]
+ SMLAL2 v20.4s, v4.8h, v0.h[7]
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v17.4s, v4.4h, v1.h[7]
+ SMLAL2 v21.4s, v4.8h, v1.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v18.4s, v4.4h, v2.h[7]
+ SMLAL2 v22.4s, v4.8h, v2.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v19.4s, v4.4h, v3.h[7]
+ SMLAL2 v23.4s, v4.8h, v3.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 2b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+        # Is there a remainder? - 1 to 7 bytes of A
+ CBNZ x0, 4f
+
+3:
+ # ks loop
+ SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
+ B.HI 1b
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ # Load per channel scale values from weights
+ LDR q4, [x5], 16
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ LDR q5, [x5], 16
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ LDR q6, [x5], 16
+ FMUL v16.4s, v16.4s, v4.4s
+ FMUL v17.4s, v17.4s, v4.4s
+ FMUL v18.4s, v18.4s, v4.4s
+ FMUL v19.4s, v19.4s, v4.4s
+ FMUL v20.4s, v20.4s, v5.4s
+ LDR q4, [x5], 16
+ FMUL v21.4s, v21.4s, v5.4s
+ FMUL v22.4s, v22.4s, v5.4s
+ FMUL v23.4s, v23.4s, v5.4s
+ FMUL v24.4s, v24.4s, v6.4s
+ FMUL v25.4s, v25.4s, v6.4s
+ FMUL v26.4s, v26.4s, v6.4s
+ FMUL v27.4s, v27.4s, v6.4s
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v6.8h
+ SQADD v17.8h, v17.8h, v6.8h
+ SQADD v18.8h, v18.8h, v6.8h
+ SQADD v19.8h, v19.8h, v6.8h
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v4.16b}, [x11], 1 // clamp min value
+
+ SQXTN v0.8b, v16.8h
+ SQXTN v1.8b, v17.8h
+ SQXTN v2.8b, v18.8h
+ SQXTN v3.8b, v19.8h
+ LD1R {v5.16b}, [x11] // clamp max value
+ SQXTN2 v0.16b, v24.8h
+ SQXTN2 v1.16b, v25.8h
+ SQXTN2 v2.16b, v26.8h
+ SQXTN2 v3.16b, v27.8h
+ SUB x11, x11, 3 // rewind params pointer
+
+ SMAX v0.16b, v0.16b, v4.16b
+ SMAX v1.16b, v1.16b, v4.16b
+ SMAX v2.16b, v2.16b, v4.16b
+ SMAX v3.16b, v3.16b, v4.16b
+ SUBS x1, x1, 16
+ SMIN v0.16b, v0.16b, v5.16b
+ SMIN v1.16b, v1.16b, v5.16b
+ SMIN v2.16b, v2.16b, v5.16b
+ SMIN v3.16b, v3.16b, v5.16b
+ B.LO 5f
+
+ # Store full 4 x 16
+ ST1 {v3.16b}, [x7], x10
+ ST1 {v2.16b}, [x17], x10
+ ST1 {v1.16b}, [x16], x10
+ ST1 {v0.16b}, [x6], x10
+
+ SUB x4, x4, x3 // a -= ks
+
+ # nc loop
+ B.HI 0b
+
+ # Restore x20-x21 from stack
+ LDP x20, x21, [sp], 16
+ RET
+
+        # Remainder - 1 to 7 bytes of A
+ .p2align 3
+4:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x13], x0
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x14], x0
+ LD1 {v2.8b}, [x15], x0
+ LD1 {v3.8b}, [x20], x0
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 3b
+
+ # Store odd width
+ .p2align 3
+5:
+ TBZ x1, 3, 6f
+ STR d3, [x7], 8
+ STR d2, [x17], 8
+ DUP d3, v3.d[1]
+ DUP d2, v2.d[1]
+ STR d1, [x16], 8
+ STR d0, [x6], 8
+ DUP d1, v1.d[1]
+ DUP d0, v0.d[1]
+6:
+ TBZ x1, 2, 7f
+ STR s3, [x7], 4
+ STR s2, [x17], 4
+ DUP s3, v3.s[1]
+ DUP s2, v2.s[1]
+ STR s1, [x16], 4
+ STR s0, [x6], 4
+ DUP s1, v1.s[1]
+ DUP s0, v0.s[1]
+7:
+ TBZ x1, 1, 8f
+ STR h3, [x7], 2
+ STR h2, [x17], 2
+ DUP h3, v3.h[1]
+ DUP h2, v2.h[1]
+ STR h1, [x16], 2
+ STR h0, [x6], 2
+ DUP h1, v1.h[1]
+ DUP h0, v0.h[1]
+8:
+ TBZ x1, 0, 9f
+ STR b3, [x7]
+ STR b2, [x17]
+ STR b1, [x16]
+ STR b0, [x6]
+9:
+ # Restore x20-x21 from stack
+ LDP x20, x21, [sp], 16
+ RET
+
+END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..908e363
--- /dev/null
+++ b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,624 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const int8_t** restrict a, x4
+# const int8_t* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x10
+# size_t a_offset, [sp + 8] -> x8
+# const int8_t* zero, [sp + 16] -> x12
+# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0
+# A1 x14 v1
+# A2 x15 v2
+# A3 x20 v3
+# B x5 v4 v5 v6
+# C0 x6 v16 v20 v24 v28
+# C1 x16 v17 v21 v25 v29
+# C2 x17 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+# x11, x21 temp for Cortex-A53 loads
+
+BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+        # Clamp C pointers
+        CMP x0, 2 // if mr < 2
+        LDP x10, x8, [sp] // Load cn_stride, a_offset
+        ADD x16, x6, x7 // c1 = c0 + cm_stride
+        CSEL x16, x6, x16, LO // c1 = c0
+
+        ADD x17, x16, x7 // c2 = c1 + cm_stride
+        LDP x12, x11, [sp, 16] // Load zero, params pointer
+        // if mr <= 2
+        CSEL x17, x16, x17, LS // c2 = c1
+
+        CMP x0, 4 // if mr < 4
+        STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+        ADD x7, x17, x7 // c3 = c2 + cm_stride
+        CSEL x7, x17, x7, LO // c3 = c2
+
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP q16, q20, [x5], 32 // bias -> v16,v20,v24,v28 (16 channels)
+        MOV v17.16b, v16.16b
+        MOV v18.16b, v16.16b
+        LDP q24, q28, [x5], 32
+        MOV v19.16b, v16.16b
+        MOV v21.16b, v20.16b
+        MOV v22.16b, v20.16b
+        MOV v23.16b, v20.16b
+        MOV v25.16b, v24.16b
+        MOV v26.16b, v24.16b
+        MOV v27.16b, v24.16b
+        MOV v29.16b, v28.16b
+        MOV v30.16b, v28.16b
+        MOV v31.16b, v28.16b
+        MOV x9, x3 // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP x13, x14, [x4], 16
+        LDP x15, x20, [x4], 16
+
+        CMP x13, x12 // if a0 == zero
+        ADD x13, x13, x8 // a0 += a_offset
+        CSEL x13, x12, x13, EQ // a0 = zero, else += a0 + a_offset
+        CMP x14, x12 // if a1 == zero
+        ADD x14, x14, x8 // a1 += a_offset
+        CSEL x14, x12, x14, EQ // a1 = zero, else += a1 + a_offset
+        CMP x15, x12 // if a2 == zero
+        ADD x15, x15, x8 // a2 += a_offset
+        CSEL x15, x12, x15, EQ // a2 = zero, else += a2 + a_offset
+        CMP x20, x12 // if a3 == zero
+        ADD x20, x20, x8 // a3 += a_offset
+        CSEL x20, x12, x20, EQ // a3 = zero, else += a3 + a_offset
+
+        # Is there at least 8 bytes for main loop?
+        SUBS x0, x2, 8 // k = kc - 8
+        B.LO 4f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+2:
+        LD1 {v0.8b}, [x13], 8 // A0
+        LDP d4, d5, [x5], 16 // 16 weights for k = 0
+        LD1 {v1.8b}, [x14], 8 // A1
+        LD1 {v2.8b}, [x15], 8 // A2
+        LD1 {v3.8b}, [x20], 8 // A3
+        SXTL v0.8h, v0.8b
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SXTL v1.8h, v1.8b
+        SXTL v2.8h, v2.8b
+        SXTL v3.8h, v3.8b
+        SMLAL v16.4s, v4.4h, v0.h[0]
+        SMLAL2 v20.4s, v4.8h, v0.h[0]
+        PRFM PLDL1KEEP, [x13, 128] // prefetch A0
+        SMLAL v24.4s, v5.4h, v0.h[0]
+        SMLAL2 v28.4s, v5.8h, v0.h[0]
+        PRFM PLDL1KEEP, [x14, 128] // prefetch A1
+        SMLAL v17.4s, v4.4h, v1.h[0]
+        SMLAL2 v21.4s, v4.8h, v1.h[0]
+        PRFM PLDL1KEEP, [x15, 128] // prefetch A2
+        SMLAL v25.4s, v5.4h, v1.h[0]
+        SMLAL2 v29.4s, v5.8h, v1.h[0]
+        PRFM PLDL1KEEP, [x20, 128] // prefetch A3
+        SMLAL v18.4s, v4.4h, v2.h[0]
+        SMLAL2 v22.4s, v4.8h, v2.h[0]
+        PRFM PLDL1KEEP, [x5, 448] // prefetch B
+        SMLAL v26.4s, v5.4h, v2.h[0]
+        SMLAL2 v30.4s, v5.8h, v2.h[0]
+        PRFM PLDL1KEEP, [x5, 512] // prefetch B
+        SMLAL v19.4s, v4.4h, v3.h[0]
+        SMLAL2 v23.4s, v4.8h, v3.h[0]
+        SMLAL v27.4s, v5.4h, v3.h[0]
+        SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+        LDP d4, d5, [x5], 16 // weights for k = 1
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[1]
+        SMLAL2 v20.4s, v4.8h, v0.h[1]
+        SMLAL v24.4s, v5.4h, v0.h[1]
+        SMLAL2 v28.4s, v5.8h, v0.h[1]
+        SMLAL v17.4s, v4.4h, v1.h[1]
+        SMLAL2 v21.4s, v4.8h, v1.h[1]
+        SMLAL v25.4s, v5.4h, v1.h[1]
+        SMLAL2 v29.4s, v5.8h, v1.h[1]
+        SMLAL v18.4s, v4.4h, v2.h[1]
+        SMLAL2 v22.4s, v4.8h, v2.h[1]
+        SMLAL v26.4s, v5.4h, v2.h[1]
+        SMLAL2 v30.4s, v5.8h, v2.h[1]
+        SMLAL v19.4s, v4.4h, v3.h[1]
+        SMLAL2 v23.4s, v4.8h, v3.h[1]
+        SMLAL v27.4s, v5.4h, v3.h[1]
+        SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+        LDP d4, d5, [x5], 16 // weights for k = 2
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[2]
+        SMLAL2 v20.4s, v4.8h, v0.h[2]
+        SMLAL v24.4s, v5.4h, v0.h[2]
+        SMLAL2 v28.4s, v5.8h, v0.h[2]
+        SMLAL v17.4s, v4.4h, v1.h[2]
+        SMLAL2 v21.4s, v4.8h, v1.h[2]
+        SMLAL v25.4s, v5.4h, v1.h[2]
+        SMLAL2 v29.4s, v5.8h, v1.h[2]
+        SMLAL v18.4s, v4.4h, v2.h[2]
+        SMLAL2 v22.4s, v4.8h, v2.h[2]
+        SMLAL v26.4s, v5.4h, v2.h[2]
+        SMLAL2 v30.4s, v5.8h, v2.h[2]
+        SMLAL v19.4s, v4.4h, v3.h[2]
+        SMLAL2 v23.4s, v4.8h, v3.h[2]
+        SMLAL v27.4s, v5.4h, v3.h[2]
+        SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+        LDP d4, d5, [x5], 16 // weights for k = 3
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[3]
+        SMLAL2 v20.4s, v4.8h, v0.h[3]
+        SMLAL v24.4s, v5.4h, v0.h[3]
+        SMLAL2 v28.4s, v5.8h, v0.h[3]
+        SMLAL v17.4s, v4.4h, v1.h[3]
+        SMLAL2 v21.4s, v4.8h, v1.h[3]
+        SMLAL v25.4s, v5.4h, v1.h[3]
+        SMLAL2 v29.4s, v5.8h, v1.h[3]
+        SMLAL v18.4s, v4.4h, v2.h[3]
+        SMLAL2 v22.4s, v4.8h, v2.h[3]
+        SMLAL v26.4s, v5.4h, v2.h[3]
+        SMLAL2 v30.4s, v5.8h, v2.h[3]
+        SMLAL v19.4s, v4.4h, v3.h[3]
+        SMLAL2 v23.4s, v4.8h, v3.h[3]
+        SMLAL v27.4s, v5.4h, v3.h[3]
+        SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+        LDP d4, d5, [x5], 16 // weights for k = 4
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[4]
+        SMLAL2 v20.4s, v4.8h, v0.h[4]
+        SMLAL v24.4s, v5.4h, v0.h[4]
+        SMLAL2 v28.4s, v5.8h, v0.h[4]
+        SMLAL v17.4s, v4.4h, v1.h[4]
+        SMLAL2 v21.4s, v4.8h, v1.h[4]
+        SMLAL v25.4s, v5.4h, v1.h[4]
+        SMLAL2 v29.4s, v5.8h, v1.h[4]
+        SMLAL v18.4s, v4.4h, v2.h[4]
+        SMLAL2 v22.4s, v4.8h, v2.h[4]
+        SMLAL v26.4s, v5.4h, v2.h[4]
+        SMLAL2 v30.4s, v5.8h, v2.h[4]
+        SMLAL v19.4s, v4.4h, v3.h[4]
+        SMLAL2 v23.4s, v4.8h, v3.h[4]
+        SMLAL v27.4s, v5.4h, v3.h[4]
+        SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+        LDP d4, d5, [x5], 16 // weights for k = 5
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[5]
+        SMLAL2 v20.4s, v4.8h, v0.h[5]
+        SMLAL v24.4s, v5.4h, v0.h[5]
+        SMLAL2 v28.4s, v5.8h, v0.h[5]
+        SMLAL v17.4s, v4.4h, v1.h[5]
+        SMLAL2 v21.4s, v4.8h, v1.h[5]
+        SMLAL v25.4s, v5.4h, v1.h[5]
+        SMLAL2 v29.4s, v5.8h, v1.h[5]
+        SMLAL v18.4s, v4.4h, v2.h[5]
+        SMLAL2 v22.4s, v4.8h, v2.h[5]
+        SMLAL v26.4s, v5.4h, v2.h[5]
+        SMLAL2 v30.4s, v5.8h, v2.h[5]
+        SMLAL v19.4s, v4.4h, v3.h[5]
+        SMLAL2 v23.4s, v4.8h, v3.h[5]
+        SMLAL v27.4s, v5.4h, v3.h[5]
+        SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+        LDP d4, d5, [x5], 16 // weights for k = 6
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[6]
+        SMLAL2 v20.4s, v4.8h, v0.h[6]
+        SMLAL v24.4s, v5.4h, v0.h[6]
+        SMLAL2 v28.4s, v5.8h, v0.h[6]
+        SMLAL v17.4s, v4.4h, v1.h[6]
+        SMLAL2 v21.4s, v4.8h, v1.h[6]
+        SMLAL v25.4s, v5.4h, v1.h[6]
+        SMLAL2 v29.4s, v5.8h, v1.h[6]
+        SMLAL v18.4s, v4.4h, v2.h[6]
+        SMLAL2 v22.4s, v4.8h, v2.h[6]
+        SMLAL v26.4s, v5.4h, v2.h[6]
+        SMLAL2 v30.4s, v5.8h, v2.h[6]
+        SMLAL v19.4s, v4.4h, v3.h[6]
+        SMLAL2 v23.4s, v4.8h, v3.h[6]
+        SMLAL v27.4s, v5.4h, v3.h[6]
+        SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+        LDP d4, d5, [x5], 16 // weights for k = 7
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[7]
+        SMLAL2 v20.4s, v4.8h, v0.h[7]
+        SMLAL v24.4s, v5.4h, v0.h[7]
+        SMLAL2 v28.4s, v5.8h, v0.h[7]
+        SMLAL v17.4s, v4.4h, v1.h[7]
+        SMLAL2 v21.4s, v4.8h, v1.h[7]
+        SMLAL v25.4s, v5.4h, v1.h[7]
+        SMLAL2 v29.4s, v5.8h, v1.h[7]
+        SMLAL v18.4s, v4.4h, v2.h[7]
+        SMLAL2 v22.4s, v4.8h, v2.h[7]
+        SMLAL v26.4s, v5.4h, v2.h[7]
+        SMLAL2 v30.4s, v5.8h, v2.h[7]
+        SMLAL v19.4s, v4.4h, v3.h[7]
+        SMLAL2 v23.4s, v4.8h, v3.h[7]
+        SMLAL v27.4s, v5.4h, v3.h[7]
+        SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+        SUBS x0, x0, 8 // k -= 8
+        B.HS 2b // loop while k >= 0
+
+        AND x0, x2, 7 // kc remainder 0 to 7
+        # Is there a remainder?- 1 to 7 bytes of A
+        CBNZ x0, 4f
+
+3:
+        # ks loop
+        SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
+        B.HI 1b
+
+        SCVTF v16.4s, v16.4s // int32 accumulators -> float
+        SCVTF v17.4s, v17.4s
+        # Load per channel scale values from weights
+        LDR q4, [x5], 16
+        SCVTF v18.4s, v18.4s
+        SCVTF v19.4s, v19.4s
+        LDR q5, [x5], 16
+        SCVTF v20.4s, v20.4s
+        SCVTF v21.4s, v21.4s
+        SCVTF v22.4s, v22.4s
+        SCVTF v23.4s, v23.4s
+        SCVTF v24.4s, v24.4s
+        SCVTF v25.4s, v25.4s
+        SCVTF v26.4s, v26.4s
+        SCVTF v27.4s, v27.4s
+        SCVTF v28.4s, v28.4s
+        SCVTF v29.4s, v29.4s
+        SCVTF v30.4s, v30.4s
+        SCVTF v31.4s, v31.4s
+
+        LDR q6, [x5], 16
+        FMUL v16.4s, v16.4s, v4.4s
+        FMUL v17.4s, v17.4s, v4.4s
+        FMUL v18.4s, v18.4s, v4.4s
+        FMUL v19.4s, v19.4s, v4.4s
+        FMUL v20.4s, v20.4s, v5.4s
+        LDR q4, [x5], 16
+        FMUL v21.4s, v21.4s, v5.4s
+        FMUL v22.4s, v22.4s, v5.4s
+        FMUL v23.4s, v23.4s, v5.4s
+        FMUL v24.4s, v24.4s, v6.4s
+        FMUL v25.4s, v25.4s, v6.4s
+        FMUL v26.4s, v26.4s, v6.4s
+        FMUL v27.4s, v27.4s, v6.4s
+        FMUL v28.4s, v28.4s, v4.4s
+        FMUL v29.4s, v29.4s, v4.4s
+        FMUL v30.4s, v30.4s, v4.4s
+        FMUL v31.4s, v31.4s, v4.4s
+
+        FCVTNS v16.4s, v16.4s // float -> int32, round to nearest (ties to even)
+        FCVTNS v17.4s, v17.4s
+        FCVTNS v18.4s, v18.4s
+        FCVTNS v19.4s, v19.4s
+        FCVTNS v20.4s, v20.4s
+        FCVTNS v21.4s, v21.4s
+        FCVTNS v22.4s, v22.4s
+        FCVTNS v23.4s, v23.4s
+        FCVTNS v24.4s, v24.4s
+        FCVTNS v25.4s, v25.4s
+        FCVTNS v26.4s, v26.4s
+        FCVTNS v27.4s, v27.4s
+        FCVTNS v28.4s, v28.4s
+        FCVTNS v29.4s, v29.4s
+        FCVTNS v30.4s, v30.4s
+        FCVTNS v31.4s, v31.4s
+
+        SQXTN v16.4h, v16.4s // saturating narrow int32 -> int16
+        SQXTN v17.4h, v17.4s
+        SQXTN v18.4h, v18.4s
+        SQXTN v19.4h, v19.4s
+        SQXTN v24.4h, v24.4s
+        SQXTN v25.4h, v25.4s
+        SQXTN v26.4h, v26.4s
+        SQXTN v27.4h, v27.4s
+        LD1R {v6.8h}, [x11], 2 // add bias
+
+        SQXTN2 v16.8h, v20.4s
+        SQXTN2 v17.8h, v21.4s
+        SQXTN2 v18.8h, v22.4s
+        SQXTN2 v19.8h, v23.4s
+        SQXTN2 v24.8h, v28.4s
+        SQXTN2 v25.8h, v29.4s
+        SQXTN2 v26.8h, v30.4s
+        SQXTN2 v27.8h, v31.4s
+
+        SQADD v16.8h, v16.8h, v6.8h
+        SQADD v17.8h, v17.8h, v6.8h
+        SQADD v18.8h, v18.8h, v6.8h
+        SQADD v19.8h, v19.8h, v6.8h
+        SQADD v24.8h, v24.8h, v6.8h
+        SQADD v25.8h, v25.8h, v6.8h
+        SQADD v26.8h, v26.8h, v6.8h
+        SQADD v27.8h, v27.8h, v6.8h
+        LD1R {v4.16b}, [x11], 1 // clamp min value
+
+        SQXTN v0.8b, v16.8h // saturating narrow int16 -> int8
+        SQXTN v1.8b, v17.8h
+        SQXTN v2.8b, v18.8h
+        SQXTN v3.8b, v19.8h
+        LD1R {v5.16b}, [x11] // clamp max value
+        SQXTN2 v0.16b, v24.8h
+        SQXTN2 v1.16b, v25.8h
+        SQXTN2 v2.16b, v26.8h
+        SQXTN2 v3.16b, v27.8h
+        SUB x11, x11, 3 // rewind params pointer
+
+        SMAX v0.16b, v0.16b, v4.16b
+        SMAX v1.16b, v1.16b, v4.16b
+        SMAX v2.16b, v2.16b, v4.16b
+        SMAX v3.16b, v3.16b, v4.16b
+        SUBS x1, x1, 16 // nc -= 16
+        SMIN v0.16b, v0.16b, v5.16b
+        SMIN v1.16b, v1.16b, v5.16b
+        SMIN v2.16b, v2.16b, v5.16b
+        SMIN v3.16b, v3.16b, v5.16b
+        B.LO 5f // partial store if nc < 16
+
+        # Store full 4 x 16
+        ST1 {v3.16b}, [x7], x10
+        ST1 {v2.16b}, [x17], x10
+        ST1 {v1.16b}, [x16], x10
+        ST1 {v0.16b}, [x6], x10
+
+        SUB x4, x4, x3 // a -= ks
+
+        # nc loop
+        B.HI 0b
+
+        # Restore x20-x21 from stack
+        LDP x20, x21, [sp], 16
+        RET
+
+        # Remainder- 1 to 7 bytes of A
+        .p2align 3
+4:
+        AND x0, x2, 7 // kc remainder 1 to 7
+
+        LD1 {v0.8b}, [x13], x0 // A0, advance by remainder
+        LDP d4, d5, [x5], 16 // weights for k = 0
+        LD1 {v1.8b}, [x14], x0
+        LD1 {v2.8b}, [x15], x0
+        LD1 {v3.8b}, [x20], x0
+        SXTL v0.8h, v0.8b
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SXTL v1.8h, v1.8b
+        SXTL v2.8h, v2.8b
+        SXTL v3.8h, v3.8b
+        SMLAL v16.4s, v4.4h, v0.h[0]
+        SMLAL2 v20.4s, v4.8h, v0.h[0]
+        SMLAL v24.4s, v5.4h, v0.h[0]
+        SMLAL2 v28.4s, v5.8h, v0.h[0]
+        SMLAL v17.4s, v4.4h, v1.h[0]
+        SMLAL2 v21.4s, v4.8h, v1.h[0]
+        SMLAL v25.4s, v5.4h, v1.h[0]
+        SMLAL2 v29.4s, v5.8h, v1.h[0]
+        SMLAL v18.4s, v4.4h, v2.h[0]
+        SMLAL2 v22.4s, v4.8h, v2.h[0]
+        SMLAL v26.4s, v5.4h, v2.h[0]
+        SMLAL2 v30.4s, v5.8h, v2.h[0]
+        SMLAL v19.4s, v4.4h, v3.h[0]
+        SMLAL2 v23.4s, v4.8h, v3.h[0]
+        SMLAL v27.4s, v5.4h, v3.h[0]
+        SMLAL2 v31.4s, v5.8h, v3.h[0]
+        CMP x0, 2
+        B.LO 3b // done if remainder < 2
+
+        LDP d4, d5, [x5], 16 // weights for k = 1
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[1]
+        SMLAL2 v20.4s, v4.8h, v0.h[1]
+        SMLAL v24.4s, v5.4h, v0.h[1]
+        SMLAL2 v28.4s, v5.8h, v0.h[1]
+        SMLAL v17.4s, v4.4h, v1.h[1]
+        SMLAL2 v21.4s, v4.8h, v1.h[1]
+        SMLAL v25.4s, v5.4h, v1.h[1]
+        SMLAL2 v29.4s, v5.8h, v1.h[1]
+        SMLAL v18.4s, v4.4h, v2.h[1]
+        SMLAL2 v22.4s, v4.8h, v2.h[1]
+        SMLAL v26.4s, v5.4h, v2.h[1]
+        SMLAL2 v30.4s, v5.8h, v2.h[1]
+        SMLAL v19.4s, v4.4h, v3.h[1]
+        SMLAL2 v23.4s, v4.8h, v3.h[1]
+        SMLAL v27.4s, v5.4h, v3.h[1]
+        SMLAL2 v31.4s, v5.8h, v3.h[1]
+        B.EQ 3b // done if remainder == 2
+
+        LDP d4, d5, [x5], 16 // weights for k = 2
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[2]
+        SMLAL2 v20.4s, v4.8h, v0.h[2]
+        SMLAL v24.4s, v5.4h, v0.h[2]
+        SMLAL2 v28.4s, v5.8h, v0.h[2]
+        SMLAL v17.4s, v4.4h, v1.h[2]
+        SMLAL2 v21.4s, v4.8h, v1.h[2]
+        SMLAL v25.4s, v5.4h, v1.h[2]
+        SMLAL2 v29.4s, v5.8h, v1.h[2]
+        SMLAL v18.4s, v4.4h, v2.h[2]
+        SMLAL2 v22.4s, v4.8h, v2.h[2]
+        SMLAL v26.4s, v5.4h, v2.h[2]
+        SMLAL2 v30.4s, v5.8h, v2.h[2]
+        SMLAL v19.4s, v4.4h, v3.h[2]
+        SMLAL2 v23.4s, v4.8h, v3.h[2]
+        SMLAL v27.4s, v5.4h, v3.h[2]
+        SMLAL2 v31.4s, v5.8h, v3.h[2]
+        CMP x0, 4
+        B.LO 3b // done if remainder < 4
+
+        LDP d4, d5, [x5], 16 // weights for k = 3
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[3]
+        SMLAL2 v20.4s, v4.8h, v0.h[3]
+        SMLAL v24.4s, v5.4h, v0.h[3]
+        SMLAL2 v28.4s, v5.8h, v0.h[3]
+        SMLAL v17.4s, v4.4h, v1.h[3]
+        SMLAL2 v21.4s, v4.8h, v1.h[3]
+        SMLAL v25.4s, v5.4h, v1.h[3]
+        SMLAL2 v29.4s, v5.8h, v1.h[3]
+        SMLAL v18.4s, v4.4h, v2.h[3]
+        SMLAL2 v22.4s, v4.8h, v2.h[3]
+        SMLAL v26.4s, v5.4h, v2.h[3]
+        SMLAL2 v30.4s, v5.8h, v2.h[3]
+        SMLAL v19.4s, v4.4h, v3.h[3]
+        SMLAL2 v23.4s, v4.8h, v3.h[3]
+        SMLAL v27.4s, v5.4h, v3.h[3]
+        SMLAL2 v31.4s, v5.8h, v3.h[3]
+        B.EQ 3b // done if remainder == 4
+
+        LDP d4, d5, [x5], 16 // weights for k = 4
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[4]
+        SMLAL2 v20.4s, v4.8h, v0.h[4]
+        SMLAL v24.4s, v5.4h, v0.h[4]
+        SMLAL2 v28.4s, v5.8h, v0.h[4]
+        SMLAL v17.4s, v4.4h, v1.h[4]
+        SMLAL2 v21.4s, v4.8h, v1.h[4]
+        SMLAL v25.4s, v5.4h, v1.h[4]
+        SMLAL2 v29.4s, v5.8h, v1.h[4]
+        SMLAL v18.4s, v4.4h, v2.h[4]
+        SMLAL2 v22.4s, v4.8h, v2.h[4]
+        SMLAL v26.4s, v5.4h, v2.h[4]
+        SMLAL2 v30.4s, v5.8h, v2.h[4]
+        SMLAL v19.4s, v4.4h, v3.h[4]
+        SMLAL2 v23.4s, v4.8h, v3.h[4]
+        SMLAL v27.4s, v5.4h, v3.h[4]
+        SMLAL2 v31.4s, v5.8h, v3.h[4]
+        CMP x0, 6
+        B.LO 3b // done if remainder < 6
+
+        LDP d4, d5, [x5], 16 // weights for k = 5
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[5]
+        SMLAL2 v20.4s, v4.8h, v0.h[5]
+        SMLAL v24.4s, v5.4h, v0.h[5]
+        SMLAL2 v28.4s, v5.8h, v0.h[5]
+        SMLAL v17.4s, v4.4h, v1.h[5]
+        SMLAL2 v21.4s, v4.8h, v1.h[5]
+        SMLAL v25.4s, v5.4h, v1.h[5]
+        SMLAL2 v29.4s, v5.8h, v1.h[5]
+        SMLAL v18.4s, v4.4h, v2.h[5]
+        SMLAL2 v22.4s, v4.8h, v2.h[5]
+        SMLAL v26.4s, v5.4h, v2.h[5]
+        SMLAL2 v30.4s, v5.8h, v2.h[5]
+        SMLAL v19.4s, v4.4h, v3.h[5]
+        SMLAL2 v23.4s, v4.8h, v3.h[5]
+        SMLAL v27.4s, v5.4h, v3.h[5]
+        SMLAL2 v31.4s, v5.8h, v3.h[5]
+        B.EQ 3b // done if remainder == 6
+
+        LDP d4, d5, [x5], 16 // weights for k = 6
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[6]
+        SMLAL2 v20.4s, v4.8h, v0.h[6]
+        SMLAL v24.4s, v5.4h, v0.h[6]
+        SMLAL2 v28.4s, v5.8h, v0.h[6]
+        SMLAL v17.4s, v4.4h, v1.h[6]
+        SMLAL2 v21.4s, v4.8h, v1.h[6]
+        SMLAL v25.4s, v5.4h, v1.h[6]
+        SMLAL2 v29.4s, v5.8h, v1.h[6]
+        SMLAL v18.4s, v4.4h, v2.h[6]
+        SMLAL2 v22.4s, v4.8h, v2.h[6]
+        SMLAL v26.4s, v5.4h, v2.h[6]
+        SMLAL2 v30.4s, v5.8h, v2.h[6]
+        SMLAL v19.4s, v4.4h, v3.h[6]
+        SMLAL2 v23.4s, v4.8h, v3.h[6]
+        SMLAL v27.4s, v5.4h, v3.h[6]
+        SMLAL2 v31.4s, v5.8h, v3.h[6]
+        B 3b // remainder == 7: done
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ x1, 3, 6f // skip if (nc & 8) == 0
+        STR d3, [x7], 8
+        STR d2, [x17], 8
+        DUP d3, v3.d[1] // shift high half down
+        DUP d2, v2.d[1]
+        STR d1, [x16], 8
+        STR d0, [x6], 8
+        DUP d1, v1.d[1]
+        DUP d0, v0.d[1]
+6:
+        TBZ x1, 2, 7f // skip if (nc & 4) == 0
+        STR s3, [x7], 4
+        STR s2, [x17], 4
+        DUP s3, v3.s[1]
+        DUP s2, v2.s[1]
+        STR s1, [x16], 4
+        STR s0, [x6], 4
+        DUP s1, v1.s[1]
+        DUP s0, v0.s[1]
+7:
+        TBZ x1, 1, 8f // skip if (nc & 2) == 0
+        STR h3, [x7], 2
+        STR h2, [x17], 2
+        DUP h3, v3.h[1]
+        DUP h2, v2.h[1]
+        STR h1, [x16], 2
+        STR h0, [x6], 2
+        DUP h1, v1.h[1]
+        DUP h0, v0.h[1]
+8:
+        TBZ x1, 0, 9f // skip if (nc & 1) == 0
+        STR b3, [x7]
+        STR b2, [x17]
+        STR b1, [x16]
+        STR b0, [x6]
+9:
+        # Restore x20-x21 from stack
+        LDP x20, x21, [sp], 16
+        RET
+
+END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in b/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
new file mode 100644
index 0000000..6d31b42
--- /dev/null
+++ b/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
@@ -0,0 +1,881 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert REQUANTIZATION in ["FP32", "GEMMLOWP", "RNDNU"]
+$assert not CHANNELWISE or REQUANTIZATION == "FP32"
+$assert DATATYPE in ["QC8", "QS8", "QU8"]
+$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
+$assert DATATYPE != "QU8" or REQUANTIZATION == "RNDNU"
+
+#include <xnnpack/assembly.h>
+
+$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
+$if DATATYPE == "QU8":
+ $REWIND_DECREMENT = 15
+$else:
+ $REWIND_DECREMENT = 3 if CHANNELWISE else {"GEMMLOWP": 11, "RNDNU": 15, "FP32": 7}[REQUANTIZATION]
+$XMIN = "UMIN" if DATATYPE == "QU8" else "SMIN"
+$XMAX = "UMAX" if DATATYPE == "QU8" else "SMAX"
+$XXTL = "UXTL" if DATATYPE == "QU8" else "SXTL"
+$SQXTXN = "SQXTUN" if DATATYPE == "QU8" else "SQXTN"
+$SQXTXN2 = "SQXTUN2" if DATATYPE == "QU8" else "SQXTN2"
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
+# void xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const ${XINT8_T}* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+# ${XINT8_T}* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+
+$if REQUANTIZATION == "RNDNU" and DATATYPE == "QU8":
+ # params structure is 20 bytes
+ # struct {
+ # ${XINT8_T} kernel_zero_point[4];
+ # int32_t right_pre_shift;
+ # int32_t multiplier;
+ # int32_t right_post_shift;
+ # int16_t output_zero_point;
+ # ${XINT8_T} output_min;
+ # ${XINT8_T} output_max;
+ # } rndnu_neon;
+ #
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3 x4 v3
+# B x5 v4 v5 v6
+# C0 x6 v16 v20 v24 v28
+# C1 x8 v17 v21 v25 v29
+# C2 x9 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+$if DATATYPE == "QU8":
+ # zero_point v7
+ # unused v8 v9 v10 v11 v12 v13 v14 v15
+$else:
+ # unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+# x10 x17 a53 temp registers
+
+BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+ # Clamp A and C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x12, x11, [sp] // Load cn_stride, params
+ ADD x15, x3, x4 // a1 = a0 + a_stride
+ ADD x8, x6, x7 // c1 = c0 + cm_stride
+ CSEL x15, x3, x15, LO // a1 = a0
+ CSEL x8, x6, x8, LO // c1 = c0
+
+ ADD x13, x15, x4 // a2 = a1 + a_stride
+ ADD x9, x8, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x13, x15, x13, LS // a2 = a1
+ CSEL x9, x8, x9, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ ADD x4, x13, x4 // a3 = a2 + a_stride
+ ADD x7, x9, x7 // c3 = c2 + cm_stride
+ CSEL x4, x13, x4, LO // a3 = a2
+ CSEL x7, x9, x7, LO // c3 = c2
+ $if DATATYPE == "QU8":
+ LD1R {v7.4s}, [x11], 4 // kernel_zero_point
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ SUBS x0, x2, 8 // k = kc - 8
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ # Is there at least 8 bytes for main loop?
+ B.LO 3f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+1:
+ LD1 {v0.8b}, [x3], 8
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x15], 8
+ LD1 {v2.8b}, [x13], 8
+ LD1 {v3.8b}, [x4], 8
+ ${XXTL} v0.8h, v0.8b
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ ${XXTL} v1.8h, v1.8b
+ ${XXTL} v2.8h, v2.8b
+ ${XXTL} v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x13, 128]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x15, 128]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x3, 128]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x4, 128]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 448]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 512]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[7]
+ SMLAL2 v20.4s, v4.8h, v0.h[7]
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v17.4s, v4.4h, v1.h[7]
+ SMLAL2 v21.4s, v4.8h, v1.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v18.4s, v4.4h, v2.h[7]
+ SMLAL2 v22.4s, v4.8h, v2.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v19.4s, v4.4h, v3.h[7]
+ SMLAL2 v23.4s, v4.8h, v3.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 1b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+ # Is there a remainder?- 1 to 7 bytes of A
+ CBNZ x0, 3f
+
+2:
+ $if REQUANTIZATION == "GEMMLOWP":
+ # Apply params - scale, shift, bias and clamp
+ LD2R {v4.4s, v5.4s}, [x11], 8
+ CMEQ v6.4s, v5.4s, 0
+
+ BIC v0.16b, v16.16b, v6.16b
+ BIC v1.16b, v17.16b, v6.16b
+ BIC v2.16b, v18.16b, v6.16b
+ BIC v3.16b, v19.16b, v6.16b
+
+ SQRDMULH v16.4s, v16.4s, v4.4s
+ SQRDMULH v17.4s, v17.4s, v4.4s
+ SQRDMULH v18.4s, v18.4s, v4.4s
+ SQRDMULH v19.4s, v19.4s, v4.4s
+
+ SSRA v16.4s, v0.4s, 31 // signed shift right accumulate
+ SSRA v17.4s, v1.4s, 31
+ SSRA v18.4s, v2.4s, 31
+ SSRA v19.4s, v3.4s, 31
+
+ BIC v0.16b, v20.16b, v6.16b
+ BIC v1.16b, v21.16b, v6.16b
+ BIC v2.16b, v22.16b, v6.16b
+ BIC v3.16b, v23.16b, v6.16b
+
+ SQRDMULH v20.4s, v20.4s, v4.4s
+ SQRDMULH v21.4s, v21.4s, v4.4s
+ SQRDMULH v22.4s, v22.4s, v4.4s
+ SQRDMULH v23.4s, v23.4s, v4.4s
+
+ SSRA v20.4s, v0.4s, 31
+ SSRA v21.4s, v1.4s, 31
+ SSRA v22.4s, v2.4s, 31
+ SSRA v23.4s, v3.4s, 31
+
+ BIC v0.16b, v24.16b, v6.16b
+ BIC v1.16b, v25.16b, v6.16b
+ BIC v2.16b, v26.16b, v6.16b
+ BIC v3.16b, v27.16b, v6.16b
+
+ SQRDMULH v24.4s, v24.4s, v4.4s
+ SQRDMULH v25.4s, v25.4s, v4.4s
+ SQRDMULH v26.4s, v26.4s, v4.4s
+ SQRDMULH v27.4s, v27.4s, v4.4s
+
+ SSRA v24.4s, v0.4s, 31
+ SSRA v25.4s, v1.4s, 31
+ SSRA v26.4s, v2.4s, 31
+ SSRA v27.4s, v3.4s, 31
+
+ BIC v0.16b, v28.16b, v6.16b
+ BIC v1.16b, v29.16b, v6.16b
+ BIC v2.16b, v30.16b, v6.16b
+ BIC v3.16b, v31.16b, v6.16b
+
+ SQRDMULH v28.4s, v28.4s, v4.4s
+ SQRDMULH v29.4s, v29.4s, v4.4s
+ SQRDMULH v30.4s, v30.4s, v4.4s
+ SQRDMULH v31.4s, v31.4s, v4.4s
+
+ SSRA v28.4s, v0.4s, 31
+ SSRA v29.4s, v1.4s, 31
+ SSRA v30.4s, v2.4s, 31
+ SSRA v31.4s, v3.4s, 31
+
+ SRSHL v16.4s, v16.4s, v5.4s // signed rounding shift left
+ SRSHL v17.4s, v17.4s, v5.4s
+ SRSHL v18.4s, v18.4s, v5.4s
+ SRSHL v19.4s, v19.4s, v5.4s
+ SRSHL v20.4s, v20.4s, v5.4s
+ SRSHL v21.4s, v21.4s, v5.4s
+ SRSHL v22.4s, v22.4s, v5.4s
+ SRSHL v23.4s, v23.4s, v5.4s
+ SRSHL v24.4s, v24.4s, v5.4s
+ SRSHL v25.4s, v25.4s, v5.4s
+ SRSHL v26.4s, v26.4s, v5.4s
+ SRSHL v27.4s, v27.4s, v5.4s
+ SRSHL v28.4s, v28.4s, v5.4s
+ SRSHL v29.4s, v29.4s, v5.4s
+ SRSHL v30.4s, v30.4s, v5.4s
+ SRSHL v31.4s, v31.4s, v5.4s
+ $elif REQUANTIZATION == "RNDNU":
+ # Apply params - preshift, scale, postshift, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SSHL v16.4s, v16.4s, v4.4s // shift to upper bits
+ SSHL v17.4s, v17.4s, v4.4s
+ SSHL v18.4s, v18.4s, v4.4s
+ SSHL v19.4s, v19.4s, v4.4s
+ SSHL v20.4s, v20.4s, v4.4s
+ SSHL v21.4s, v21.4s, v4.4s
+ SSHL v22.4s, v22.4s, v4.4s
+ SSHL v23.4s, v23.4s, v4.4s
+ LD1R {v5.4s}, [x11], 4
+ SSHL v24.4s, v24.4s, v4.4s
+ SSHL v25.4s, v25.4s, v4.4s
+ SSHL v26.4s, v26.4s, v4.4s
+ SSHL v27.4s, v27.4s, v4.4s
+ SSHL v28.4s, v28.4s, v4.4s
+ SSHL v29.4s, v29.4s, v4.4s
+ SSHL v30.4s, v30.4s, v4.4s
+ SSHL v31.4s, v31.4s, v4.4s
+ LD1R {v6.4s}, [x11], 4
+ SQDMULH v16.4s, v16.4s, v5.4s // scale without rounding
+ SQDMULH v17.4s, v17.4s, v5.4s
+ SQDMULH v18.4s, v18.4s, v5.4s
+ SQDMULH v19.4s, v19.4s, v5.4s
+ SQDMULH v20.4s, v20.4s, v5.4s
+ SQDMULH v21.4s, v21.4s, v5.4s
+ SQDMULH v22.4s, v22.4s, v5.4s
+ SQDMULH v23.4s, v23.4s, v5.4s
+ SQDMULH v24.4s, v24.4s, v5.4s
+ SQDMULH v25.4s, v25.4s, v5.4s
+ SQDMULH v26.4s, v26.4s, v5.4s
+ SQDMULH v27.4s, v27.4s, v5.4s
+ SQDMULH v28.4s, v28.4s, v5.4s
+ SQDMULH v29.4s, v29.4s, v5.4s
+ SQDMULH v30.4s, v30.4s, v5.4s
+ SQDMULH v31.4s, v31.4s, v5.4s
+ SRSHL v16.4s, v16.4s, v6.4s // signed rounding shift left
+ SRSHL v17.4s, v17.4s, v6.4s
+ SRSHL v18.4s, v18.4s, v6.4s
+ SRSHL v19.4s, v19.4s, v6.4s
+ SRSHL v20.4s, v20.4s, v6.4s
+ SRSHL v21.4s, v21.4s, v6.4s
+ SRSHL v22.4s, v22.4s, v6.4s
+ SRSHL v23.4s, v23.4s, v6.4s
+ SRSHL v24.4s, v24.4s, v6.4s
+ SRSHL v25.4s, v25.4s, v6.4s
+ SRSHL v26.4s, v26.4s, v6.4s
+ SRSHL v27.4s, v27.4s, v6.4s
+ SRSHL v28.4s, v28.4s, v6.4s
+ SRSHL v29.4s, v29.4s, v6.4s
+ SRSHL v30.4s, v30.4s, v6.4s
+ SRSHL v31.4s, v31.4s, v6.4s
+ $elif REQUANTIZATION == "FP32":
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ $if not CHANNELWISE:
+ # Apply params - scale, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ $else:
+ # Load per channel scale values from weights
+ LDR q4, [x5], 16
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ LDR q5, [x5], 16
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ $if CHANNELWISE:
+ LDR q6, [x5], 16
+ FMUL v16.4s, v16.4s, v4.4s
+ FMUL v17.4s, v17.4s, v4.4s
+ FMUL v18.4s, v18.4s, v4.4s
+ FMUL v19.4s, v19.4s, v4.4s
+ FMUL v20.4s, v20.4s, v5.4s
+ LDR q4, [x5], 16
+ FMUL v21.4s, v21.4s, v5.4s
+ FMUL v22.4s, v22.4s, v5.4s
+ FMUL v23.4s, v23.4s, v5.4s
+ FMUL v24.4s, v24.4s, v6.4s
+ FMUL v25.4s, v25.4s, v6.4s
+ FMUL v26.4s, v26.4s, v6.4s
+ FMUL v27.4s, v27.4s, v6.4s
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+ $else:
+ FMUL v16.4s, v16.4s, v4.4s
+ FMUL v17.4s, v17.4s, v4.4s
+ FMUL v18.4s, v18.4s, v4.4s
+ FMUL v19.4s, v19.4s, v4.4s
+ FMUL v20.4s, v20.4s, v4.4s
+ FMUL v21.4s, v21.4s, v4.4s
+ FMUL v22.4s, v22.4s, v4.4s
+ FMUL v23.4s, v23.4s, v4.4s
+ FMUL v24.4s, v24.4s, v4.4s
+ FMUL v25.4s, v25.4s, v4.4s
+ FMUL v26.4s, v26.4s, v4.4s
+ FMUL v27.4s, v27.4s, v4.4s
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v6.8h
+ SQADD v17.8h, v17.8h, v6.8h
+ SQADD v18.8h, v18.8h, v6.8h
+ SQADD v19.8h, v19.8h, v6.8h
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v4.16b}, [x11], 1 // clamp min value
+
+ ${SQXTXN} v0.8b, v16.8h
+ ${SQXTXN} v1.8b, v17.8h
+ ${SQXTXN} v2.8b, v18.8h
+ ${SQXTXN} v3.8b, v19.8h
+ LD1R {v5.16b}, [x11] // clamp max value
+ ${SQXTXN2} v0.16b, v24.8h
+ ${SQXTXN2} v1.16b, v25.8h
+ ${SQXTXN2} v2.16b, v26.8h
+ ${SQXTXN2} v3.16b, v27.8h
+ SUB x11, x11, ${REWIND_DECREMENT} // rewind params pointer
+
+ ${XMAX} v0.16b, v0.16b, v4.16b
+ ${XMAX} v1.16b, v1.16b, v4.16b
+ ${XMAX} v2.16b, v2.16b, v4.16b
+ ${XMAX} v3.16b, v3.16b, v4.16b
+ SUBS x1, x1, 16
+ ${XMIN} v0.16b, v0.16b, v5.16b
+ ${XMIN} v1.16b, v1.16b, v5.16b
+ ${XMIN} v2.16b, v2.16b, v5.16b
+ ${XMIN} v3.16b, v3.16b, v5.16b
+ B.LO 4f
+
+ # Store full 4 x 16
+ ST1 {v0.16b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v1.16b}, [x8], x12
+ SUB x15, x15, x2 // a1 -= kc
+ ST1 {v2.16b}, [x9], x12
+ SUB x13, x13, x2 // a2 -= kc
+ ST1 {v3.16b}, [x7], x12
+ SUB x4, x4, x2 // a3 -= kc
+ B.NE 0b
+ RET
+
+ # Remainder- 1 to 7 bytes of A
+ .p2align 3
+3:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x3], x0
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x15], x0
+ LD1 {v2.8b}, [x13], x0
+ LD1 {v3.8b}, [x4], x0
+ ${XXTL} v0.8h, v0.8b
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ ${XXTL} v1.8h, v1.8b
+ ${XXTL} v2.8h, v2.8b
+ ${XXTL} v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 2b
+
+ # Store odd width
+ .p2align 3
+4:
+ TBZ x1, 3, 5f
+ STR d0, [x6], 8
+ STR d1, [x8], 8
+ DUP d0, v0.d[1]
+ DUP d1, v1.d[1]
+ STR d2, [x9], 8
+ STR d3, [x7], 8
+ DUP d2, v2.d[1]
+ DUP d3, v3.d[1]
+5:
+ TBZ x1, 2, 6f
+ STR s0, [x6], 4
+ STR s1, [x8], 4
+ DUP s0, v0.s[1]
+ DUP s1, v1.s[1]
+ STR s2, [x9], 4
+ STR s3, [x7], 4
+ DUP s2, v2.s[1]
+ DUP s3, v3.s[1]
+6:
+ TBZ x1, 1, 7f
+ STR h0, [x6], 2
+ STR h1, [x8], 2
+ DUP h0, v0.h[1]
+ DUP h1, v1.h[1]
+ STR h2, [x9], 2
+ STR h3, [x7], 2
+ DUP h2, v2.h[1]
+ DUP h3, v3.h[1]
+7:
+ TBZ x1, 0, 8f
+ STR b0, [x6]
+ STR b1, [x8]
+ STR b2, [x9]
+ STR b3, [x7]
+8:
+ RET
+
+END_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..d552539
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,587 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const int8_t* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3 x4 v3
+# B x5 v4 v5 v6
+# C0 x6 v16 v20 v24 v28
+# C1 x8 v17 v21 v25 v29
+# C2 x9 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+# x10 x17 a53 temp registers
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+        # Clamp A and C pointers
+        CMP x0, 2 // if mr < 2
+        LDP x12, x11, [sp] // Load cn_stride, params
+        ADD x15, x3, x4 // a1 = a0 + a_stride
+        ADD x8, x6, x7 // c1 = c0 + cm_stride
+        CSEL x15, x3, x15, LO // a1 = a0
+        CSEL x8, x6, x8, LO // c1 = c0
+
+        ADD x13, x15, x4 // a2 = a1 + a_stride
+        ADD x9, x8, x7 // c2 = c1 + cm_stride
+        // if mr <= 2
+        CSEL x13, x15, x13, LS // a2 = a1
+        CSEL x9, x8, x9, LS // c2 = c1
+
+        CMP x0, 4 // if mr < 4
+        ADD x4, x13, x4 // a3 = a2 + a_stride
+        ADD x7, x9, x7 // c3 = c2 + cm_stride
+        CSEL x4, x13, x4, LO // a3 = a2
+        CSEL x7, x9, x7, LO // c3 = c2
+
+        .p2align 3
+0: // outer loop: one iteration per 16 output channels (nc)
+        # Load initial bias from w into accumulators
+        LDP q16, q20, [x5], 32
+        MOV v17.16b, v16.16b
+        MOV v18.16b, v16.16b
+        LDP q24, q28, [x5], 32
+        MOV v19.16b, v16.16b
+        MOV v21.16b, v20.16b
+        MOV v22.16b, v20.16b
+        MOV v23.16b, v20.16b
+        SUBS x0, x2, 8 // k = kc - 8
+        MOV v25.16b, v24.16b
+        MOV v26.16b, v24.16b
+        MOV v27.16b, v24.16b
+        MOV v29.16b, v28.16b
+        MOV v30.16b, v28.16b
+        MOV v31.16b, v28.16b
+        # Is there at least 8 bytes for main loop?
+        B.LO 3f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+1: // inner loop: consumes 8 bytes of A (k) per iteration
+        LD1 {v0.8b}, [x3], 8
+        LDP d4, d5, [x5], 16
+        LD1 {v1.8b}, [x15], 8
+        LD1 {v2.8b}, [x13], 8
+        LD1 {v3.8b}, [x4], 8
+        SXTL v0.8h, v0.8b
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SXTL v1.8h, v1.8b
+        SXTL v2.8h, v2.8b
+        SXTL v3.8h, v3.8b
+        SMLAL v16.4s, v4.4h, v0.h[0]
+        SMLAL2 v20.4s, v4.8h, v0.h[0]
+        SMLAL v24.4s, v5.4h, v0.h[0]
+        SMLAL2 v28.4s, v5.8h, v0.h[0]
+        SMLAL v17.4s, v4.4h, v1.h[0]
+        SMLAL2 v21.4s, v4.8h, v1.h[0]
+        SMLAL v25.4s, v5.4h, v1.h[0]
+        SMLAL2 v29.4s, v5.8h, v1.h[0]
+        SMLAL v18.4s, v4.4h, v2.h[0]
+        SMLAL2 v22.4s, v4.8h, v2.h[0]
+        SMLAL v26.4s, v5.4h, v2.h[0]
+        SMLAL2 v30.4s, v5.8h, v2.h[0]
+        SMLAL v19.4s, v4.4h, v3.h[0]
+        SMLAL2 v23.4s, v4.8h, v3.h[0]
+        SMLAL v27.4s, v5.4h, v3.h[0]
+        SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+        LDP d4, d5, [x5], 16
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[1]
+        SMLAL2 v20.4s, v4.8h, v0.h[1]
+        SMLAL v24.4s, v5.4h, v0.h[1]
+        SMLAL2 v28.4s, v5.8h, v0.h[1]
+        SMLAL v17.4s, v4.4h, v1.h[1]
+        SMLAL2 v21.4s, v4.8h, v1.h[1]
+        SMLAL v25.4s, v5.4h, v1.h[1]
+        SMLAL2 v29.4s, v5.8h, v1.h[1]
+        SMLAL v18.4s, v4.4h, v2.h[1]
+        SMLAL2 v22.4s, v4.8h, v2.h[1]
+        SMLAL v26.4s, v5.4h, v2.h[1]
+        SMLAL2 v30.4s, v5.8h, v2.h[1]
+        SMLAL v19.4s, v4.4h, v3.h[1]
+        SMLAL2 v23.4s, v4.8h, v3.h[1]
+        SMLAL v27.4s, v5.4h, v3.h[1]
+        SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+        LDP d4, d5, [x5], 16
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[2]
+        SMLAL2 v20.4s, v4.8h, v0.h[2]
+        SMLAL v24.4s, v5.4h, v0.h[2]
+        SMLAL2 v28.4s, v5.8h, v0.h[2]
+        SMLAL v17.4s, v4.4h, v1.h[2]
+        SMLAL2 v21.4s, v4.8h, v1.h[2]
+        SMLAL v25.4s, v5.4h, v1.h[2]
+        SMLAL2 v29.4s, v5.8h, v1.h[2]
+        SMLAL v18.4s, v4.4h, v2.h[2]
+        SMLAL2 v22.4s, v4.8h, v2.h[2]
+        SMLAL v26.4s, v5.4h, v2.h[2]
+        SMLAL2 v30.4s, v5.8h, v2.h[2]
+        SMLAL v19.4s, v4.4h, v3.h[2]
+        SMLAL2 v23.4s, v4.8h, v3.h[2]
+        SMLAL v27.4s, v5.4h, v3.h[2]
+        SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+        LDP d4, d5, [x5], 16
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[3]
+        SMLAL2 v20.4s, v4.8h, v0.h[3]
+        SMLAL v24.4s, v5.4h, v0.h[3]
+        SMLAL2 v28.4s, v5.8h, v0.h[3]
+        SMLAL v17.4s, v4.4h, v1.h[3]
+        SMLAL2 v21.4s, v4.8h, v1.h[3]
+        SMLAL v25.4s, v5.4h, v1.h[3]
+        SMLAL2 v29.4s, v5.8h, v1.h[3]
+        SMLAL v18.4s, v4.4h, v2.h[3]
+        SMLAL2 v22.4s, v4.8h, v2.h[3]
+        SMLAL v26.4s, v5.4h, v2.h[3]
+        SMLAL2 v30.4s, v5.8h, v2.h[3]
+        SMLAL v19.4s, v4.4h, v3.h[3]
+        SMLAL2 v23.4s, v4.8h, v3.h[3]
+        SMLAL v27.4s, v5.4h, v3.h[3]
+        SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+        LDP d4, d5, [x5], 16
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[4]
+        SMLAL2 v20.4s, v4.8h, v0.h[4]
+        SMLAL v24.4s, v5.4h, v0.h[4]
+        SMLAL2 v28.4s, v5.8h, v0.h[4]
+        SMLAL v17.4s, v4.4h, v1.h[4]
+        SMLAL2 v21.4s, v4.8h, v1.h[4]
+        SMLAL v25.4s, v5.4h, v1.h[4]
+        SMLAL2 v29.4s, v5.8h, v1.h[4]
+        SMLAL v18.4s, v4.4h, v2.h[4]
+        SMLAL2 v22.4s, v4.8h, v2.h[4]
+        SMLAL v26.4s, v5.4h, v2.h[4]
+        SMLAL2 v30.4s, v5.8h, v2.h[4]
+        SMLAL v19.4s, v4.4h, v3.h[4]
+        SMLAL2 v23.4s, v4.8h, v3.h[4]
+        SMLAL v27.4s, v5.4h, v3.h[4]
+        SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+        LDP d4, d5, [x5], 16
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[5]
+        SMLAL2 v20.4s, v4.8h, v0.h[5]
+        SMLAL v24.4s, v5.4h, v0.h[5]
+        SMLAL2 v28.4s, v5.8h, v0.h[5]
+        SMLAL v17.4s, v4.4h, v1.h[5]
+        SMLAL2 v21.4s, v4.8h, v1.h[5]
+        SMLAL v25.4s, v5.4h, v1.h[5]
+        SMLAL2 v29.4s, v5.8h, v1.h[5]
+        SMLAL v18.4s, v4.4h, v2.h[5]
+        SMLAL2 v22.4s, v4.8h, v2.h[5]
+        SMLAL v26.4s, v5.4h, v2.h[5]
+        SMLAL2 v30.4s, v5.8h, v2.h[5]
+        SMLAL v19.4s, v4.4h, v3.h[5]
+        SMLAL2 v23.4s, v4.8h, v3.h[5]
+        SMLAL v27.4s, v5.4h, v3.h[5]
+        SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+        LDP d4, d5, [x5], 16
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[6]
+        SMLAL2 v20.4s, v4.8h, v0.h[6]
+        SMLAL v24.4s, v5.4h, v0.h[6]
+        SMLAL2 v28.4s, v5.8h, v0.h[6]
+        SMLAL v17.4s, v4.4h, v1.h[6]
+        SMLAL2 v21.4s, v4.8h, v1.h[6]
+        SMLAL v25.4s, v5.4h, v1.h[6]
+        SMLAL2 v29.4s, v5.8h, v1.h[6]
+        SMLAL v18.4s, v4.4h, v2.h[6]
+        SMLAL2 v22.4s, v4.8h, v2.h[6]
+        SMLAL v26.4s, v5.4h, v2.h[6]
+        SMLAL2 v30.4s, v5.8h, v2.h[6]
+        SMLAL v19.4s, v4.4h, v3.h[6]
+        SMLAL2 v23.4s, v4.8h, v3.h[6]
+        SMLAL v27.4s, v5.4h, v3.h[6]
+        SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+        LDP d4, d5, [x5], 16
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[7]
+        SMLAL2 v20.4s, v4.8h, v0.h[7]
+        SMLAL v24.4s, v5.4h, v0.h[7]
+        SMLAL2 v28.4s, v5.8h, v0.h[7]
+        SMLAL v17.4s, v4.4h, v1.h[7]
+        SMLAL2 v21.4s, v4.8h, v1.h[7]
+        SMLAL v25.4s, v5.4h, v1.h[7]
+        SMLAL2 v29.4s, v5.8h, v1.h[7]
+        SMLAL v18.4s, v4.4h, v2.h[7]
+        SMLAL2 v22.4s, v4.8h, v2.h[7]
+        SMLAL v26.4s, v5.4h, v2.h[7]
+        SMLAL2 v30.4s, v5.8h, v2.h[7]
+        SMLAL v19.4s, v4.4h, v3.h[7]
+        SMLAL2 v23.4s, v4.8h, v3.h[7]
+        SMLAL v27.4s, v5.4h, v3.h[7]
+        SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+        SUBS x0, x0, 8
+        B.HS 1b
+
+        AND x0, x2, 7 // kc remainder 0 to 7
+        # Is there a remainder? 1 to 7 bytes of A
+        CBNZ x0, 3f
+
+2: // requantize: int32 accumulators -> fp32, scale, round, saturate to int8
+        SCVTF v16.4s, v16.4s
+        SCVTF v17.4s, v17.4s
+        # Apply params - scale, bias and clamp
+        LD1R {v4.4s}, [x11], 4
+        SCVTF v18.4s, v18.4s
+        SCVTF v19.4s, v19.4s
+        SCVTF v20.4s, v20.4s
+        SCVTF v21.4s, v21.4s
+        SCVTF v22.4s, v22.4s
+        SCVTF v23.4s, v23.4s
+        SCVTF v24.4s, v24.4s
+        SCVTF v25.4s, v25.4s
+        SCVTF v26.4s, v26.4s
+        SCVTF v27.4s, v27.4s
+        SCVTF v28.4s, v28.4s
+        SCVTF v29.4s, v29.4s
+        SCVTF v30.4s, v30.4s
+        SCVTF v31.4s, v31.4s
+
+        FMUL v16.4s, v16.4s, v4.4s
+        FMUL v17.4s, v17.4s, v4.4s
+        FMUL v18.4s, v18.4s, v4.4s
+        FMUL v19.4s, v19.4s, v4.4s
+        FMUL v20.4s, v20.4s, v4.4s
+        FMUL v21.4s, v21.4s, v4.4s
+        FMUL v22.4s, v22.4s, v4.4s
+        FMUL v23.4s, v23.4s, v4.4s
+        FMUL v24.4s, v24.4s, v4.4s
+        FMUL v25.4s, v25.4s, v4.4s
+        FMUL v26.4s, v26.4s, v4.4s
+        FMUL v27.4s, v27.4s, v4.4s
+        FMUL v28.4s, v28.4s, v4.4s
+        FMUL v29.4s, v29.4s, v4.4s
+        FMUL v30.4s, v30.4s, v4.4s
+        FMUL v31.4s, v31.4s, v4.4s
+
+        FCVTNS v16.4s, v16.4s
+        FCVTNS v17.4s, v17.4s
+        FCVTNS v18.4s, v18.4s
+        FCVTNS v19.4s, v19.4s
+        FCVTNS v20.4s, v20.4s
+        FCVTNS v21.4s, v21.4s
+        FCVTNS v22.4s, v22.4s
+        FCVTNS v23.4s, v23.4s
+        FCVTNS v24.4s, v24.4s
+        FCVTNS v25.4s, v25.4s
+        FCVTNS v26.4s, v26.4s
+        FCVTNS v27.4s, v27.4s
+        FCVTNS v28.4s, v28.4s
+        FCVTNS v29.4s, v29.4s
+        FCVTNS v30.4s, v30.4s
+        FCVTNS v31.4s, v31.4s
+
+        SQXTN v16.4h, v16.4s
+        SQXTN v17.4h, v17.4s
+        SQXTN v18.4h, v18.4s
+        SQXTN v19.4h, v19.4s
+        SQXTN v24.4h, v24.4s
+        SQXTN v25.4h, v25.4s
+        SQXTN v26.4h, v26.4s
+        SQXTN v27.4h, v27.4s
+        LD1R {v6.8h}, [x11], 2 // add bias
+
+        SQXTN2 v16.8h, v20.4s
+        SQXTN2 v17.8h, v21.4s
+        SQXTN2 v18.8h, v22.4s
+        SQXTN2 v19.8h, v23.4s
+        SQXTN2 v24.8h, v28.4s
+        SQXTN2 v25.8h, v29.4s
+        SQXTN2 v26.8h, v30.4s
+        SQXTN2 v27.8h, v31.4s
+
+        SQADD v16.8h, v16.8h, v6.8h
+        SQADD v17.8h, v17.8h, v6.8h
+        SQADD v18.8h, v18.8h, v6.8h
+        SQADD v19.8h, v19.8h, v6.8h
+        SQADD v24.8h, v24.8h, v6.8h
+        SQADD v25.8h, v25.8h, v6.8h
+        SQADD v26.8h, v26.8h, v6.8h
+        SQADD v27.8h, v27.8h, v6.8h
+        LD1R {v4.16b}, [x11], 1 // clamp min value
+
+        SQXTN v0.8b, v16.8h
+        SQXTN v1.8b, v17.8h
+        SQXTN v2.8b, v18.8h
+        SQXTN v3.8b, v19.8h
+        LD1R {v5.16b}, [x11] // clamp max value
+        SQXTN2 v0.16b, v24.8h
+        SQXTN2 v1.16b, v25.8h
+        SQXTN2 v2.16b, v26.8h
+        SQXTN2 v3.16b, v27.8h
+        SUB x11, x11, 7 // rewind params pointer (4+2+1 bytes consumed above)
+
+        SMAX v0.16b, v0.16b, v4.16b
+        SMAX v1.16b, v1.16b, v4.16b
+        SMAX v2.16b, v2.16b, v4.16b
+        SMAX v3.16b, v3.16b, v4.16b
+        SUBS x1, x1, 16
+        SMIN v0.16b, v0.16b, v5.16b
+        SMIN v1.16b, v1.16b, v5.16b
+        SMIN v2.16b, v2.16b, v5.16b
+        SMIN v3.16b, v3.16b, v5.16b
+        B.LO 4f
+
+        # Store full 4 x 16
+        ST1 {v0.16b}, [x6], x12
+        SUB x3, x3, x2 // a0 -= kc
+        ST1 {v1.16b}, [x8], x12
+        SUB x15, x15, x2 // a1 -= kc
+        ST1 {v2.16b}, [x9], x12
+        SUB x13, x13, x2 // a2 -= kc
+        ST1 {v3.16b}, [x7], x12
+        SUB x4, x4, x2 // a3 -= kc
+        B.NE 0b
+        RET
+
+        # Remainder - 1 to 7 bytes of A
+        .p2align 3
+3:
+        AND x0, x2, 7 // kc remainder 1 to 7
+
+        LD1 {v0.8b}, [x3], x0
+        LDP d4, d5, [x5], 16
+        LD1 {v1.8b}, [x15], x0
+        LD1 {v2.8b}, [x13], x0
+        LD1 {v3.8b}, [x4], x0
+        SXTL v0.8h, v0.8b
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SXTL v1.8h, v1.8b
+        SXTL v2.8h, v2.8b
+        SXTL v3.8h, v3.8b
+        SMLAL v16.4s, v4.4h, v0.h[0]
+        SMLAL2 v20.4s, v4.8h, v0.h[0]
+        SMLAL v24.4s, v5.4h, v0.h[0]
+        SMLAL2 v28.4s, v5.8h, v0.h[0]
+        SMLAL v17.4s, v4.4h, v1.h[0]
+        SMLAL2 v21.4s, v4.8h, v1.h[0]
+        SMLAL v25.4s, v5.4h, v1.h[0]
+        SMLAL2 v29.4s, v5.8h, v1.h[0]
+        SMLAL v18.4s, v4.4h, v2.h[0]
+        SMLAL2 v22.4s, v4.8h, v2.h[0]
+        SMLAL v26.4s, v5.4h, v2.h[0]
+        SMLAL2 v30.4s, v5.8h, v2.h[0]
+        SMLAL v19.4s, v4.4h, v3.h[0]
+        SMLAL2 v23.4s, v4.8h, v3.h[0]
+        SMLAL v27.4s, v5.4h, v3.h[0]
+        SMLAL2 v31.4s, v5.8h, v3.h[0]
+        CMP x0, 2
+        B.LO 2b
+
+        LDP d4, d5, [x5], 16
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[1]
+        SMLAL2 v20.4s, v4.8h, v0.h[1]
+        SMLAL v24.4s, v5.4h, v0.h[1]
+        SMLAL2 v28.4s, v5.8h, v0.h[1]
+        SMLAL v17.4s, v4.4h, v1.h[1]
+        SMLAL2 v21.4s, v4.8h, v1.h[1]
+        SMLAL v25.4s, v5.4h, v1.h[1]
+        SMLAL2 v29.4s, v5.8h, v1.h[1]
+        SMLAL v18.4s, v4.4h, v2.h[1]
+        SMLAL2 v22.4s, v4.8h, v2.h[1]
+        SMLAL v26.4s, v5.4h, v2.h[1]
+        SMLAL2 v30.4s, v5.8h, v2.h[1]
+        SMLAL v19.4s, v4.4h, v3.h[1]
+        SMLAL2 v23.4s, v4.8h, v3.h[1]
+        SMLAL v27.4s, v5.4h, v3.h[1]
+        SMLAL2 v31.4s, v5.8h, v3.h[1]
+        B.EQ 2b
+
+        LDP d4, d5, [x5], 16
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[2]
+        SMLAL2 v20.4s, v4.8h, v0.h[2]
+        SMLAL v24.4s, v5.4h, v0.h[2]
+        SMLAL2 v28.4s, v5.8h, v0.h[2]
+        SMLAL v17.4s, v4.4h, v1.h[2]
+        SMLAL2 v21.4s, v4.8h, v1.h[2]
+        SMLAL v25.4s, v5.4h, v1.h[2]
+        SMLAL2 v29.4s, v5.8h, v1.h[2]
+        SMLAL v18.4s, v4.4h, v2.h[2]
+        SMLAL2 v22.4s, v4.8h, v2.h[2]
+        SMLAL v26.4s, v5.4h, v2.h[2]
+        SMLAL2 v30.4s, v5.8h, v2.h[2]
+        SMLAL v19.4s, v4.4h, v3.h[2]
+        SMLAL2 v23.4s, v4.8h, v3.h[2]
+        SMLAL v27.4s, v5.4h, v3.h[2]
+        SMLAL2 v31.4s, v5.8h, v3.h[2]
+        CMP x0, 4
+        B.LO 2b
+
+        LDP d4, d5, [x5], 16
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[3]
+        SMLAL2 v20.4s, v4.8h, v0.h[3]
+        SMLAL v24.4s, v5.4h, v0.h[3]
+        SMLAL2 v28.4s, v5.8h, v0.h[3]
+        SMLAL v17.4s, v4.4h, v1.h[3]
+        SMLAL2 v21.4s, v4.8h, v1.h[3]
+        SMLAL v25.4s, v5.4h, v1.h[3]
+        SMLAL2 v29.4s, v5.8h, v1.h[3]
+        SMLAL v18.4s, v4.4h, v2.h[3]
+        SMLAL2 v22.4s, v4.8h, v2.h[3]
+        SMLAL v26.4s, v5.4h, v2.h[3]
+        SMLAL2 v30.4s, v5.8h, v2.h[3]
+        SMLAL v19.4s, v4.4h, v3.h[3]
+        SMLAL2 v23.4s, v4.8h, v3.h[3]
+        SMLAL v27.4s, v5.4h, v3.h[3]
+        SMLAL2 v31.4s, v5.8h, v3.h[3]
+        B.EQ 2b
+
+        LDP d4, d5, [x5], 16
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[4]
+        SMLAL2 v20.4s, v4.8h, v0.h[4]
+        SMLAL v24.4s, v5.4h, v0.h[4]
+        SMLAL2 v28.4s, v5.8h, v0.h[4]
+        SMLAL v17.4s, v4.4h, v1.h[4]
+        SMLAL2 v21.4s, v4.8h, v1.h[4]
+        SMLAL v25.4s, v5.4h, v1.h[4]
+        SMLAL2 v29.4s, v5.8h, v1.h[4]
+        SMLAL v18.4s, v4.4h, v2.h[4]
+        SMLAL2 v22.4s, v4.8h, v2.h[4]
+        SMLAL v26.4s, v5.4h, v2.h[4]
+        SMLAL2 v30.4s, v5.8h, v2.h[4]
+        SMLAL v19.4s, v4.4h, v3.h[4]
+        SMLAL2 v23.4s, v4.8h, v3.h[4]
+        SMLAL v27.4s, v5.4h, v3.h[4]
+        SMLAL2 v31.4s, v5.8h, v3.h[4]
+        CMP x0, 6
+        B.LO 2b
+
+        LDP d4, d5, [x5], 16
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[5]
+        SMLAL2 v20.4s, v4.8h, v0.h[5]
+        SMLAL v24.4s, v5.4h, v0.h[5]
+        SMLAL2 v28.4s, v5.8h, v0.h[5]
+        SMLAL v17.4s, v4.4h, v1.h[5]
+        SMLAL2 v21.4s, v4.8h, v1.h[5]
+        SMLAL v25.4s, v5.4h, v1.h[5]
+        SMLAL2 v29.4s, v5.8h, v1.h[5]
+        SMLAL v18.4s, v4.4h, v2.h[5]
+        SMLAL2 v22.4s, v4.8h, v2.h[5]
+        SMLAL v26.4s, v5.4h, v2.h[5]
+        SMLAL2 v30.4s, v5.8h, v2.h[5]
+        SMLAL v19.4s, v4.4h, v3.h[5]
+        SMLAL2 v23.4s, v4.8h, v3.h[5]
+        SMLAL v27.4s, v5.4h, v3.h[5]
+        SMLAL2 v31.4s, v5.8h, v3.h[5]
+        B.EQ 2b
+
+        LDP d4, d5, [x5], 16
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[6]
+        SMLAL2 v20.4s, v4.8h, v0.h[6]
+        SMLAL v24.4s, v5.4h, v0.h[6]
+        SMLAL2 v28.4s, v5.8h, v0.h[6]
+        SMLAL v17.4s, v4.4h, v1.h[6]
+        SMLAL2 v21.4s, v4.8h, v1.h[6]
+        SMLAL v25.4s, v5.4h, v1.h[6]
+        SMLAL2 v29.4s, v5.8h, v1.h[6]
+        SMLAL v18.4s, v4.4h, v2.h[6]
+        SMLAL2 v22.4s, v4.8h, v2.h[6]
+        SMLAL v26.4s, v5.4h, v2.h[6]
+        SMLAL2 v30.4s, v5.8h, v2.h[6]
+        SMLAL v19.4s, v4.4h, v3.h[6]
+        SMLAL2 v23.4s, v4.8h, v3.h[6]
+        SMLAL v27.4s, v5.4h, v3.h[6]
+        SMLAL2 v31.4s, v5.8h, v3.h[6]
+        B 2b
+
+        # Store odd width
+        .p2align 3
+4:
+        TBZ x1, 3, 5f
+        STR d0, [x6], 8
+        STR d1, [x8], 8
+        DUP d0, v0.d[1]
+        DUP d1, v1.d[1]
+        STR d2, [x9], 8
+        STR d3, [x7], 8
+        DUP d2, v2.d[1]
+        DUP d3, v3.d[1]
+5:
+        TBZ x1, 2, 6f
+        STR s0, [x6], 4
+        STR s1, [x8], 4
+        DUP s0, v0.s[1]
+        DUP s1, v1.s[1]
+        STR s2, [x9], 4
+        STR s3, [x7], 4
+        DUP s2, v2.s[1]
+        DUP s3, v3.s[1]
+6:
+        TBZ x1, 1, 7f
+        STR h0, [x6], 2
+        STR h1, [x8], 2
+        DUP h0, v0.h[1]
+        DUP h1, v1.h[1]
+        STR h2, [x9], 2
+        STR h3, [x7], 2
+        DUP h2, v2.h[1]
+        DUP h3, v3.h[1]
+7:
+        TBZ x1, 0, 8f
+        STR b0, [x6]
+        STR b1, [x8]
+        STR b2, [x9]
+        STR b3, [x7]
+8:
+        RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..6f95707
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,593 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const int8_t* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3 x4 v3
+# B x5 v4 v5 v6
+# C0 x6 v16 v20 v24 v28
+# C1 x8 v17 v21 v25 v29
+# C2 x9 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+# x10 x17 a53 temp registers
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+ # Clamp A and C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x12, x11, [sp] // Load cn_stride, params
+ ADD x15, x3, x4 // a1 = a0 + a_stride
+ ADD x8, x6, x7 // c1 = c0 + cm_stride
+ CSEL x15, x3, x15, LO // a1 = a0
+ CSEL x8, x6, x8, LO // c1 = c0
+
+ ADD x13, x15, x4 // a2 = a1 + a_stride
+ ADD x9, x8, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x13, x15, x13, LS // a2 = a1
+ CSEL x9, x8, x9, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ ADD x4, x13, x4 // a3 = a2 + a_stride
+ ADD x7, x9, x7 // c3 = c2 + cm_stride
+ CSEL x4, x13, x4, LO // a3 = a2
+ CSEL x7, x9, x7, LO // c3 = c2
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ SUBS x0, x2, 8 // k = kc - 8
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ # Is there at least 8 bytes for main loop?
+ B.LO 3f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+1:
+ LD1 {v0.8b}, [x3], 8
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x15], 8
+ LD1 {v2.8b}, [x13], 8
+ LD1 {v3.8b}, [x4], 8
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ PRFM PLDL1KEEP, [x13, 128]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ PRFM PLDL1KEEP, [x15, 128]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ PRFM PLDL1KEEP, [x3, 128]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ PRFM PLDL1KEEP, [x4, 128]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ PRFM PLDL1KEEP, [x5, 448]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ PRFM PLDL1KEEP, [x5, 512]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[7]
+ SMLAL2 v20.4s, v4.8h, v0.h[7]
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v17.4s, v4.4h, v1.h[7]
+ SMLAL2 v21.4s, v4.8h, v1.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v18.4s, v4.4h, v2.h[7]
+ SMLAL2 v22.4s, v4.8h, v2.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v19.4s, v4.4h, v3.h[7]
+ SMLAL2 v23.4s, v4.8h, v3.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 1b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+ # Is there a remainder?- 1 to 7 bytes of A
+ CBNZ x0, 3f
+
+2:
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ # Apply params - scale, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v16.4s, v4.4s
+ FMUL v17.4s, v17.4s, v4.4s
+ FMUL v18.4s, v18.4s, v4.4s
+ FMUL v19.4s, v19.4s, v4.4s
+ FMUL v20.4s, v20.4s, v4.4s
+ FMUL v21.4s, v21.4s, v4.4s
+ FMUL v22.4s, v22.4s, v4.4s
+ FMUL v23.4s, v23.4s, v4.4s
+ FMUL v24.4s, v24.4s, v4.4s
+ FMUL v25.4s, v25.4s, v4.4s
+ FMUL v26.4s, v26.4s, v4.4s
+ FMUL v27.4s, v27.4s, v4.4s
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v6.8h
+ SQADD v17.8h, v17.8h, v6.8h
+ SQADD v18.8h, v18.8h, v6.8h
+ SQADD v19.8h, v19.8h, v6.8h
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v4.16b}, [x11], 1 // clamp min value
+
+ SQXTN v0.8b, v16.8h
+ SQXTN v1.8b, v17.8h
+ SQXTN v2.8b, v18.8h
+ SQXTN v3.8b, v19.8h
+ LD1R {v5.16b}, [x11] // clamp max value
+ SQXTN2 v0.16b, v24.8h
+ SQXTN2 v1.16b, v25.8h
+ SQXTN2 v2.16b, v26.8h
+ SQXTN2 v3.16b, v27.8h
+ SUB x11, x11, 7 // rewind params pointer
+
+ SMAX v0.16b, v0.16b, v4.16b
+ SMAX v1.16b, v1.16b, v4.16b
+ SMAX v2.16b, v2.16b, v4.16b
+ SMAX v3.16b, v3.16b, v4.16b
+ SUBS x1, x1, 16
+ SMIN v0.16b, v0.16b, v5.16b
+ SMIN v1.16b, v1.16b, v5.16b
+ SMIN v2.16b, v2.16b, v5.16b
+ SMIN v3.16b, v3.16b, v5.16b
+ B.LO 4f
+
+ # Store full 4 x 16
+ ST1 {v0.16b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v1.16b}, [x8], x12
+ SUB x15, x15, x2 // a1 -= kc
+ ST1 {v2.16b}, [x9], x12
+ SUB x13, x13, x2 // a2 -= kc
+ ST1 {v3.16b}, [x7], x12
+ SUB x4, x4, x2 // a3 -= kc
+ B.NE 0b
+ RET
+
+ # Remainder- 1 to 7 bytes of A
+ .p2align 3
+3:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x3], x0
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x15], x0
+ LD1 {v2.8b}, [x13], x0
+ LD1 {v3.8b}, [x4], x0
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 2b
+
+ # Store odd width
+ .p2align 3
+4:
+ TBZ x1, 3, 5f
+ STR d0, [x6], 8
+ STR d1, [x8], 8
+ DUP d0, v0.d[1]
+ DUP d1, v1.d[1]
+ STR d2, [x9], 8
+ STR d3, [x7], 8
+ DUP d2, v2.d[1]
+ DUP d3, v3.d[1]
+5:
+ TBZ x1, 2, 6f
+ STR s0, [x6], 4
+ STR s1, [x8], 4
+ DUP s0, v0.s[1]
+ DUP s1, v1.s[1]
+ STR s2, [x9], 4
+ STR s3, [x7], 4
+ DUP s2, v2.s[1]
+ DUP s3, v3.s[1]
+6:
+ TBZ x1, 1, 7f
+ STR h0, [x6], 2
+ STR h1, [x8], 2
+ DUP h0, v0.h[1]
+ DUP h1, v1.h[1]
+ STR h2, [x9], 2
+ STR h3, [x7], 2
+ DUP h2, v2.h[1]
+ DUP h3, v3.h[1]
+7:
+ TBZ x1, 0, 8f
+ STR b0, [x6]
+ STR b1, [x8]
+ STR b2, [x9]
+ STR b3, [x7]
+8:
+ RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..ebd7491
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,587 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const int8_t* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3 x4 v3
+# B x5 v4 v5 v6
+# C0 x6 v16 v20 v24 v28
+# C1 x8 v17 v21 v25 v29
+# C2 x9 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+# x10 x17 a53 temp registers
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+ # Clamp A and C pointers: rows past mr alias the previous row so loads/stores stay in bounds
+ CMP x0, 2 // if mr < 2
+ LDP x12, x11, [sp] // Load cn_stride, params
+ ADD x15, x3, x4 // a1 = a0 + a_stride
+ ADD x8, x6, x7 // c1 = c0 + cm_stride
+ CSEL x15, x3, x15, LO // a1 = a0
+ CSEL x8, x6, x8, LO // c1 = c0
+
+ ADD x13, x15, x4 // a2 = a1 + a_stride
+ ADD x9, x8, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x13, x15, x13, LS // a2 = a1
+ CSEL x9, x8, x9, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ ADD x4, x13, x4 // a3 = a2 + a_stride
+ ADD x7, x9, x7 // c3 = c2 + cm_stride
+ CSEL x4, x13, x4, LO // a3 = a2
+ CSEL x7, x9, x7, LO // c3 = c2
+
+ .p2align 3
+0:
+ # Outer loop over nc: load initial bias from w into all 4 rows of accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b // rows 1-3 start from the same bias as row 0
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ SUBS x0, x2, 8 // k = kc - 8
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ # Is there at least 8 bytes for main loop?
+ B.LO 3f
+
+ # Main loop - 8 bytes of A per row, 8 rows of 16 B weights per iteration
+ .p2align 3
+1:
+ LD1 {v0.8b}, [x3], 8
+ LDP d4, d5, [x5], 16 // 16 B weights for lane 0
+ LD1 {v1.8b}, [x15], 8
+ LD1 {v2.8b}, [x13], 8
+ LD1 {v3.8b}, [x4], 8
+ SXTL v0.8h, v0.8b // sign-extend int8 A and B to int16 for SMLAL
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDP d4, d5, [x5], 16 // 16 B weights for lane 1
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDP d4, d5, [x5], 16 // 16 B weights for lane 2
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDP d4, d5, [x5], 16 // 16 B weights for lane 3
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDP d4, d5, [x5], 16 // 16 B weights for lane 4
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDP d4, d5, [x5], 16 // 16 B weights for lane 5
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDP d4, d5, [x5], 16 // 16 B weights for lane 6
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDP d4, d5, [x5], 16 // 16 B weights for lane 7
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[7]
+ SMLAL2 v20.4s, v4.8h, v0.h[7]
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v17.4s, v4.4h, v1.h[7]
+ SMLAL2 v21.4s, v4.8h, v1.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v18.4s, v4.4h, v2.h[7]
+ SMLAL2 v22.4s, v4.8h, v2.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v19.4s, v4.4h, v3.h[7]
+ SMLAL2 v23.4s, v4.8h, v3.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8 // k -= 8
+ B.HS 1b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+ # Is there a remainder? - 1 to 7 bytes of A
+ CBNZ x0, 3f
+
+2:
+ # Apply params - preshift, scale, postshift, bias and clamp (rndnu requantization)
+ LD1R {v4.4s}, [x11], 4 // right_pre_shift (negative shift amount)
+ SSHL v16.4s, v16.4s, v4.4s // shift to upper bits
+ SSHL v17.4s, v17.4s, v4.4s
+ SSHL v18.4s, v18.4s, v4.4s
+ SSHL v19.4s, v19.4s, v4.4s
+ SSHL v20.4s, v20.4s, v4.4s
+ SSHL v21.4s, v21.4s, v4.4s
+ SSHL v22.4s, v22.4s, v4.4s
+ SSHL v23.4s, v23.4s, v4.4s
+ LD1R {v5.4s}, [x11], 4 // multiplier
+ SSHL v24.4s, v24.4s, v4.4s
+ SSHL v25.4s, v25.4s, v4.4s
+ SSHL v26.4s, v26.4s, v4.4s
+ SSHL v27.4s, v27.4s, v4.4s
+ SSHL v28.4s, v28.4s, v4.4s
+ SSHL v29.4s, v29.4s, v4.4s
+ SSHL v30.4s, v30.4s, v4.4s
+ SSHL v31.4s, v31.4s, v4.4s
+ LD1R {v6.4s}, [x11], 4 // right_post_shift (negative shift amount)
+ SQDMULH v16.4s, v16.4s, v5.4s // scale without rounding
+ SQDMULH v17.4s, v17.4s, v5.4s
+ SQDMULH v18.4s, v18.4s, v5.4s
+ SQDMULH v19.4s, v19.4s, v5.4s
+ SQDMULH v20.4s, v20.4s, v5.4s
+ SQDMULH v21.4s, v21.4s, v5.4s
+ SQDMULH v22.4s, v22.4s, v5.4s
+ SQDMULH v23.4s, v23.4s, v5.4s
+ SQDMULH v24.4s, v24.4s, v5.4s
+ SQDMULH v25.4s, v25.4s, v5.4s
+ SQDMULH v26.4s, v26.4s, v5.4s
+ SQDMULH v27.4s, v27.4s, v5.4s
+ SQDMULH v28.4s, v28.4s, v5.4s
+ SQDMULH v29.4s, v29.4s, v5.4s
+ SQDMULH v30.4s, v30.4s, v5.4s
+ SQDMULH v31.4s, v31.4s, v5.4s
+ SRSHL v16.4s, v16.4s, v6.4s // signed rounding shift left
+ SRSHL v17.4s, v17.4s, v6.4s
+ SRSHL v18.4s, v18.4s, v6.4s
+ SRSHL v19.4s, v19.4s, v6.4s
+ SRSHL v20.4s, v20.4s, v6.4s
+ SRSHL v21.4s, v21.4s, v6.4s
+ SRSHL v22.4s, v22.4s, v6.4s
+ SRSHL v23.4s, v23.4s, v6.4s
+ SRSHL v24.4s, v24.4s, v6.4s
+ SRSHL v25.4s, v25.4s, v6.4s
+ SRSHL v26.4s, v26.4s, v6.4s
+ SRSHL v27.4s, v27.4s, v6.4s
+ SRSHL v28.4s, v28.4s, v6.4s
+ SRSHL v29.4s, v29.4s, v6.4s
+ SRSHL v30.4s, v30.4s, v6.4s
+ SRSHL v31.4s, v31.4s, v6.4s
+
+ SQXTN v16.4h, v16.4s // saturating narrow int32 -> int16
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias (output zero point)
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v6.8h
+ SQADD v17.8h, v17.8h, v6.8h
+ SQADD v18.8h, v18.8h, v6.8h
+ SQADD v19.8h, v19.8h, v6.8h
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v4.16b}, [x11], 1 // clamp min value
+
+ SQXTN v0.8b, v16.8h // saturating narrow int16 -> int8
+ SQXTN v1.8b, v17.8h
+ SQXTN v2.8b, v18.8h
+ SQXTN v3.8b, v19.8h
+ LD1R {v5.16b}, [x11] // clamp max value
+ SQXTN2 v0.16b, v24.8h
+ SQXTN2 v1.16b, v25.8h
+ SQXTN2 v2.16b, v26.8h
+ SQXTN2 v3.16b, v27.8h
+ SUB x11, x11, 15 // rewind params pointer (4+4+4+2+1 bytes consumed above)
+
+ SMAX v0.16b, v0.16b, v4.16b
+ SMAX v1.16b, v1.16b, v4.16b
+ SMAX v2.16b, v2.16b, v4.16b
+ SMAX v3.16b, v3.16b, v4.16b
+ SUBS x1, x1, 16 // nc -= 16
+ SMIN v0.16b, v0.16b, v5.16b
+ SMIN v1.16b, v1.16b, v5.16b
+ SMIN v2.16b, v2.16b, v5.16b
+ SMIN v3.16b, v3.16b, v5.16b
+ B.LO 4f
+
+ # Store full 4 x 16
+ ST1 {v0.16b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v1.16b}, [x8], x12
+ SUB x15, x15, x2 // a1 -= kc
+ ST1 {v2.16b}, [x9], x12
+ SUB x13, x13, x2 // a2 -= kc
+ ST1 {v3.16b}, [x7], x12
+ SUB x4, x4, x2 // a3 -= kc
+ B.NE 0b
+ RET
+
+ # Remainder - 1 to 7 bytes of A; branch back to 2: when remainder consumed
+ .p2align 3
+3:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x3], x0 // advance A pointers by remainder only
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x15], x0
+ LD1 {v2.8b}, [x13], x0
+ LD1 {v3.8b}, [x4], x0
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16 // remainder lane 1
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16 // remainder lane 2
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16 // remainder lane 3
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16 // remainder lane 4
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16 // remainder lane 5
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16 // remainder lane 6
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 2b
+
+ # Store odd width (low 4 bits of x1 equal low 4 bits of nc after SUBS above)
+ .p2align 3
+4:
+ TBZ x1, 3, 5f // nc & 8? if not, skip 8-byte store
+ STR d0, [x6], 8
+ STR d1, [x8], 8
+ DUP d0, v0.d[1] // shift remaining bytes into low half
+ DUP d1, v1.d[1]
+ STR d2, [x9], 8
+ STR d3, [x7], 8
+ DUP d2, v2.d[1]
+ DUP d3, v3.d[1]
+5:
+ TBZ x1, 2, 6f // nc & 4? if not, skip 4-byte store
+ STR s0, [x6], 4
+ STR s1, [x8], 4
+ DUP s0, v0.s[1]
+ DUP s1, v1.s[1]
+ STR s2, [x9], 4
+ STR s3, [x7], 4
+ DUP s2, v2.s[1]
+ DUP s3, v3.s[1]
+6:
+ TBZ x1, 1, 7f // nc & 2? if not, skip 2-byte store
+ STR h0, [x6], 2
+ STR h1, [x8], 2
+ DUP h0, v0.h[1]
+ DUP h1, v1.h[1]
+ STR h2, [x9], 2
+ STR h3, [x7], 2
+ DUP h2, v2.h[1]
+ DUP h3, v3.h[1]
+7:
+ TBZ x1, 0, 8f // nc & 1? if not, done
+ STR b0, [x6]
+ STR b1, [x8]
+ STR b2, [x9]
+ STR b3, [x7]
+8:
+ RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..0836cbd
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,593 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const int8_t* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3 x4 v3
+# B x5 v4 v5 v6
+# C0 x6 v16 v20 v24 v28
+# C1 x8 v17 v21 v25 v29
+# C2 x9 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+# x10 x17 a53 temp registers
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+ # Clamp A and C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x12, x11, [sp] // Load cn_stride, params
+ ADD x15, x3, x4 // a1 = a0 + a_stride
+ ADD x8, x6, x7 // c1 = c0 + cm_stride
+ CSEL x15, x3, x15, LO // a1 = a0
+ CSEL x8, x6, x8, LO // c1 = c0
+
+ ADD x13, x15, x4 // a2 = a1 + a_stride
+ ADD x9, x8, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x13, x15, x13, LS // a2 = a1
+ CSEL x9, x8, x9, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ ADD x4, x13, x4 // a3 = a2 + a_stride
+ ADD x7, x9, x7 // c3 = c2 + cm_stride
+ CSEL x4, x13, x4, LO // a3 = a2
+ CSEL x7, x9, x7, LO // c3 = c2
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ SUBS x0, x2, 8 // k = kc - 8
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ # Is there at least 8 bytes for main loop?
+ B.LO 3f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+1:
+ LD1 {v0.8b}, [x3], 8
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x15], 8
+ LD1 {v2.8b}, [x13], 8
+ LD1 {v3.8b}, [x4], 8
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ PRFM PLDL1KEEP, [x13, 128]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ PRFM PLDL1KEEP, [x15, 128]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ PRFM PLDL1KEEP, [x3, 128]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ PRFM PLDL1KEEP, [x4, 128]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ PRFM PLDL1KEEP, [x5, 448]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ PRFM PLDL1KEEP, [x5, 512]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[7]
+ SMLAL2 v20.4s, v4.8h, v0.h[7]
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v17.4s, v4.4h, v1.h[7]
+ SMLAL2 v21.4s, v4.8h, v1.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v18.4s, v4.4h, v2.h[7]
+ SMLAL2 v22.4s, v4.8h, v2.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v19.4s, v4.4h, v3.h[7]
+ SMLAL2 v23.4s, v4.8h, v3.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 1b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+ # Is there a remainder?- 1 to 7 bytes of A
+ CBNZ x0, 3f
+
+2:
+ # Apply params - preshift, scale, postshift, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SSHL v16.4s, v16.4s, v4.4s // shift to upper bits
+ SSHL v17.4s, v17.4s, v4.4s
+ SSHL v18.4s, v18.4s, v4.4s
+ SSHL v19.4s, v19.4s, v4.4s
+ SSHL v20.4s, v20.4s, v4.4s
+ SSHL v21.4s, v21.4s, v4.4s
+ SSHL v22.4s, v22.4s, v4.4s
+ SSHL v23.4s, v23.4s, v4.4s
+ LD1R {v5.4s}, [x11], 4
+ SSHL v24.4s, v24.4s, v4.4s
+ SSHL v25.4s, v25.4s, v4.4s
+ SSHL v26.4s, v26.4s, v4.4s
+ SSHL v27.4s, v27.4s, v4.4s
+ SSHL v28.4s, v28.4s, v4.4s
+ SSHL v29.4s, v29.4s, v4.4s
+ SSHL v30.4s, v30.4s, v4.4s
+ SSHL v31.4s, v31.4s, v4.4s
+ LD1R {v6.4s}, [x11], 4
+ SQDMULH v16.4s, v16.4s, v5.4s // scale without rounding
+ SQDMULH v17.4s, v17.4s, v5.4s
+ SQDMULH v18.4s, v18.4s, v5.4s
+ SQDMULH v19.4s, v19.4s, v5.4s
+ SQDMULH v20.4s, v20.4s, v5.4s
+ SQDMULH v21.4s, v21.4s, v5.4s
+ SQDMULH v22.4s, v22.4s, v5.4s
+ SQDMULH v23.4s, v23.4s, v5.4s
+ SQDMULH v24.4s, v24.4s, v5.4s
+ SQDMULH v25.4s, v25.4s, v5.4s
+ SQDMULH v26.4s, v26.4s, v5.4s
+ SQDMULH v27.4s, v27.4s, v5.4s
+ SQDMULH v28.4s, v28.4s, v5.4s
+ SQDMULH v29.4s, v29.4s, v5.4s
+ SQDMULH v30.4s, v30.4s, v5.4s
+ SQDMULH v31.4s, v31.4s, v5.4s
+ SRSHL v16.4s, v16.4s, v6.4s // signed rounding shift left
+ SRSHL v17.4s, v17.4s, v6.4s
+ SRSHL v18.4s, v18.4s, v6.4s
+ SRSHL v19.4s, v19.4s, v6.4s
+ SRSHL v20.4s, v20.4s, v6.4s
+ SRSHL v21.4s, v21.4s, v6.4s
+ SRSHL v22.4s, v22.4s, v6.4s
+ SRSHL v23.4s, v23.4s, v6.4s
+ SRSHL v24.4s, v24.4s, v6.4s
+ SRSHL v25.4s, v25.4s, v6.4s
+ SRSHL v26.4s, v26.4s, v6.4s
+ SRSHL v27.4s, v27.4s, v6.4s
+ SRSHL v28.4s, v28.4s, v6.4s
+ SRSHL v29.4s, v29.4s, v6.4s
+ SRSHL v30.4s, v30.4s, v6.4s
+ SRSHL v31.4s, v31.4s, v6.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v6.8h
+ SQADD v17.8h, v17.8h, v6.8h
+ SQADD v18.8h, v18.8h, v6.8h
+ SQADD v19.8h, v19.8h, v6.8h
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v4.16b}, [x11], 1 // clamp min value
+
+ SQXTN v0.8b, v16.8h
+ SQXTN v1.8b, v17.8h
+ SQXTN v2.8b, v18.8h
+ SQXTN v3.8b, v19.8h
+ LD1R {v5.16b}, [x11] // clamp max value
+ SQXTN2 v0.16b, v24.8h
+ SQXTN2 v1.16b, v25.8h
+ SQXTN2 v2.16b, v26.8h
+ SQXTN2 v3.16b, v27.8h
+ SUB x11, x11, 15 // rewind params pointer
+
+ SMAX v0.16b, v0.16b, v4.16b
+ SMAX v1.16b, v1.16b, v4.16b
+ SMAX v2.16b, v2.16b, v4.16b
+ SMAX v3.16b, v3.16b, v4.16b
+ SUBS x1, x1, 16
+ SMIN v0.16b, v0.16b, v5.16b
+ SMIN v1.16b, v1.16b, v5.16b
+ SMIN v2.16b, v2.16b, v5.16b
+ SMIN v3.16b, v3.16b, v5.16b
+ B.LO 4f
+
+ # Store full 4 x 16
+ ST1 {v0.16b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v1.16b}, [x8], x12
+ SUB x15, x15, x2 // a1 -= kc
+ ST1 {v2.16b}, [x9], x12
+ SUB x13, x13, x2 // a2 -= kc
+ ST1 {v3.16b}, [x7], x12
+ SUB x4, x4, x2 // a3 -= kc
+ B.NE 0b
+ RET
+
+        # Remainder - 1 to 7 bytes of A
+ .p2align 3
+3:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x3], x0
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x15], x0
+ LD1 {v2.8b}, [x13], x0
+ LD1 {v3.8b}, [x4], x0
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 2b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 2b
+
+ # Store odd width
+ .p2align 3
+4:
+ TBZ x1, 3, 5f
+ STR d0, [x6], 8
+ STR d1, [x8], 8
+ DUP d0, v0.d[1]
+ DUP d1, v1.d[1]
+ STR d2, [x9], 8
+ STR d3, [x7], 8
+ DUP d2, v2.d[1]
+ DUP d3, v3.d[1]
+5:
+ TBZ x1, 2, 6f
+ STR s0, [x6], 4
+ STR s1, [x8], 4
+ DUP s0, v0.s[1]
+ DUP s1, v1.s[1]
+ STR s2, [x9], 4
+ STR s3, [x7], 4
+ DUP s2, v2.s[1]
+ DUP s3, v3.s[1]
+6:
+ TBZ x1, 1, 7f
+ STR h0, [x6], 2
+ STR h1, [x8], 2
+ DUP h0, v0.h[1]
+ DUP h1, v1.h[1]
+ STR h2, [x9], 2
+ STR h3, [x7], 2
+ DUP h2, v2.h[1]
+ DUP h3, v3.h[1]
+7:
+ TBZ x1, 0, 8f
+ STR b0, [x6]
+ STR b1, [x8]
+ STR b2, [x9]
+ STR b3, [x7]
+8:
+ RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in b/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
new file mode 100644
index 0000000..9730957
--- /dev/null
+++ b/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
@@ -0,0 +1,911 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert REQUANTIZATION in ["FP32", "GEMMLOWP", "RNDNU"]
+$assert not CHANNELWISE or REQUANTIZATION == "FP32"
+$assert DATATYPE in ["QC8", "QS8", "QU8"]
+$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
+$assert DATATYPE != "QU8" or REQUANTIZATION == "RNDNU"
+
+#include <xnnpack/assembly.h>
+
+$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
+$if DATATYPE == "QU8":
+ $REWIND_DECREMENT = 19
+$else:
+ $REWIND_DECREMENT = 3 if CHANNELWISE else {"GEMMLOWP": 11, "RNDNU": 15, "FP32": 7}[REQUANTIZATION]
+$XMIN = "UMIN" if DATATYPE == "QU8" else "SMIN"
+$XMAX = "UMAX" if DATATYPE == "QU8" else "SMAX"
+$XXTL = "UXTL" if DATATYPE == "QU8" else "SXTL"
+$SQXTXN = "SQXTUN" if DATATYPE == "QU8" else "SQXTN"
+$SQXTXN2 = "SQXTUN2" if DATATYPE == "QU8" else "SQXTN2"
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
+# void xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const ${XINT8_T}** restrict a, x4
+# const ${XINT8_T}* restrict w, x5
+# ${XINT8_T}* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x10
+# size_t a_offset, [sp + 8] -> x8
+# const ${XINT8_T}* zero, [sp + 16] -> x12
+# const ${PARAMS_UNION} params  [sp + 24] -> (x11)
+
+$if REQUANTIZATION == "RNDNU" and DATATYPE == "QU8":
+ # params structure is 20 bytes
+ # struct {
+ # ${XINT8_T} kernel_zero_point[4];
+ # int32_t right_pre_shift;
+ # int32_t multiplier;
+ # int32_t right_post_shift;
+ # int16_t output_zero_point;
+ # ${XINT8_T} output_min;
+ # ${XINT8_T} output_max;
+ # } rndnu_neon;
+ #
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0
+# A1 x14 v1
+# A2 x15 v2
+# A3 x20 v3
+# B x5 v4 v5 v6
+# C0 x6 v16 v20 v24 v28
+# C1 x16 v17 v21 v25 v29
+# C2 x17 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+$if DATATYPE == "QU8":
+ # zero_point v7
+ # unused v8 v9 v10 v11 v12 v13 v14 v15
+$else:
+ # unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+# x11 doubles as params pointer; x11, x21 are load temps only in the Cortex-A53 variant of this template (unused as temps in ld64)
+
+BEGIN_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+ # Clamp C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x10, x8, [sp] // Load cn_stride, a_offset
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x16, x6, x16, LO // c1 = c0
+
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ LDP x12, x11, [sp, 16] // Load zero, params pointer
+ // if mr <= 2
+ CSEL x17, x16, x17, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+ ADD x7, x17, x7 // c3 = c2 + cm_stride
+ CSEL x7, x17, x7, LO // c3 = c2
+
+ $if DATATYPE == "QU8":
+ LD1R {v7.4s}, [x11] // kernel_zero_point
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ $if DATATYPE == "QU8":
+ ADD x11, x11, 4 // adjust params pointer
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ MOV x9, x3 // p = ks
+
+ .p2align 3
+1:
+ # Load next 4 A pointers
+ LDP x13, x14, [x4], 16
+ LDP x15, x20, [x4], 16
+
+ CMP x13, x12 // if a0 == zero
+ ADD x13, x13, x8 // a0 += a_offset
+ CSEL x13, x12, x13, EQ // a0 = zero, else += a0 + a_offset
+ CMP x14, x12 // if a1 == zero
+ ADD x14, x14, x8 // a1 += a_offset
+ CSEL x14, x12, x14, EQ // a1 = zero, else += a1 + a_offset
+ CMP x15, x12 // if a2 == zero
+ ADD x15, x15, x8 // a2 += a_offset
+ CSEL x15, x12, x15, EQ // a2 = zero, else += a2 + a_offset
+ CMP x20, x12 // if a3 == zero
+ ADD x20, x20, x8 // a3 += a_offset
+ CSEL x20, x12, x20, EQ // a3 = zero, else += a3 + a_offset
+
+ # Is there at least 8 bytes for main loop?
+ SUBS x0, x2, 8 // k = kc - 8
+ B.LO 4f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+2:
+ LD1 {v0.8b}, [x13], 8
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x14], 8
+ LD1 {v2.8b}, [x15], 8
+ LD1 {v3.8b}, [x20], 8
+ ${XXTL} v0.8h, v0.8b
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ ${XXTL} v1.8h, v1.8b
+ ${XXTL} v2.8h, v2.8b
+ ${XXTL} v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x13, 128]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x14, 128]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x15, 128]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x20, 128]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 448]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 512]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[7]
+ SMLAL2 v20.4s, v4.8h, v0.h[7]
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v17.4s, v4.4h, v1.h[7]
+ SMLAL2 v21.4s, v4.8h, v1.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v18.4s, v4.4h, v2.h[7]
+ SMLAL2 v22.4s, v4.8h, v2.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v19.4s, v4.4h, v3.h[7]
+ SMLAL2 v23.4s, v4.8h, v3.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 2b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+        # Is there a remainder? - 1 to 7 bytes of A
+ CBNZ x0, 4f
+
+3:
+ # ks loop
+ SUBS x9, x9, 32 // ks -= MR * sizeof(${XINT8_T}*)
+ B.HI 1b
+
+ $if REQUANTIZATION == "GEMMLOWP":
+ # Apply params - scale, shift, bias and clamp
+ LD2R {v4.4s, v5.4s}, [x11], 8
+ CMEQ v6.4s, v5.4s, 0
+
+ BIC v0.16b, v16.16b, v6.16b
+ BIC v1.16b, v17.16b, v6.16b
+ BIC v2.16b, v18.16b, v6.16b
+ BIC v3.16b, v19.16b, v6.16b
+
+ SQRDMULH v16.4s, v16.4s, v4.4s
+ SQRDMULH v17.4s, v17.4s, v4.4s
+ SQRDMULH v18.4s, v18.4s, v4.4s
+ SQRDMULH v19.4s, v19.4s, v4.4s
+
+ SSRA v16.4s, v0.4s, 31 // signed shift right accumulate
+ SSRA v17.4s, v1.4s, 31
+ SSRA v18.4s, v2.4s, 31
+ SSRA v19.4s, v3.4s, 31
+
+ BIC v0.16b, v20.16b, v6.16b
+ BIC v1.16b, v21.16b, v6.16b
+ BIC v2.16b, v22.16b, v6.16b
+ BIC v3.16b, v23.16b, v6.16b
+
+ SQRDMULH v20.4s, v20.4s, v4.4s
+ SQRDMULH v21.4s, v21.4s, v4.4s
+ SQRDMULH v22.4s, v22.4s, v4.4s
+ SQRDMULH v23.4s, v23.4s, v4.4s
+
+ SSRA v20.4s, v0.4s, 31
+ SSRA v21.4s, v1.4s, 31
+ SSRA v22.4s, v2.4s, 31
+ SSRA v23.4s, v3.4s, 31
+
+ BIC v0.16b, v24.16b, v6.16b
+ BIC v1.16b, v25.16b, v6.16b
+ BIC v2.16b, v26.16b, v6.16b
+ BIC v3.16b, v27.16b, v6.16b
+
+ SQRDMULH v24.4s, v24.4s, v4.4s
+ SQRDMULH v25.4s, v25.4s, v4.4s
+ SQRDMULH v26.4s, v26.4s, v4.4s
+ SQRDMULH v27.4s, v27.4s, v4.4s
+
+ SSRA v24.4s, v0.4s, 31
+ SSRA v25.4s, v1.4s, 31
+ SSRA v26.4s, v2.4s, 31
+ SSRA v27.4s, v3.4s, 31
+
+ BIC v0.16b, v28.16b, v6.16b
+ BIC v1.16b, v29.16b, v6.16b
+ BIC v2.16b, v30.16b, v6.16b
+ BIC v3.16b, v31.16b, v6.16b
+
+ SQRDMULH v28.4s, v28.4s, v4.4s
+ SQRDMULH v29.4s, v29.4s, v4.4s
+ SQRDMULH v30.4s, v30.4s, v4.4s
+ SQRDMULH v31.4s, v31.4s, v4.4s
+
+ SSRA v28.4s, v0.4s, 31
+ SSRA v29.4s, v1.4s, 31
+ SSRA v30.4s, v2.4s, 31
+ SSRA v31.4s, v3.4s, 31
+
+ SRSHL v16.4s, v16.4s, v5.4s // signed rounding shift left
+ SRSHL v17.4s, v17.4s, v5.4s
+ SRSHL v18.4s, v18.4s, v5.4s
+ SRSHL v19.4s, v19.4s, v5.4s
+ SRSHL v20.4s, v20.4s, v5.4s
+ SRSHL v21.4s, v21.4s, v5.4s
+ SRSHL v22.4s, v22.4s, v5.4s
+ SRSHL v23.4s, v23.4s, v5.4s
+ SRSHL v24.4s, v24.4s, v5.4s
+ SRSHL v25.4s, v25.4s, v5.4s
+ SRSHL v26.4s, v26.4s, v5.4s
+ SRSHL v27.4s, v27.4s, v5.4s
+ SRSHL v28.4s, v28.4s, v5.4s
+ SRSHL v29.4s, v29.4s, v5.4s
+ SRSHL v30.4s, v30.4s, v5.4s
+ SRSHL v31.4s, v31.4s, v5.4s
+ $elif REQUANTIZATION == "RNDNU":
+ # Apply params - preshift, scale, postshift, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SSHL v16.4s, v16.4s, v4.4s // shift to upper bits
+ SSHL v17.4s, v17.4s, v4.4s
+ SSHL v18.4s, v18.4s, v4.4s
+ SSHL v19.4s, v19.4s, v4.4s
+ SSHL v20.4s, v20.4s, v4.4s
+ SSHL v21.4s, v21.4s, v4.4s
+ SSHL v22.4s, v22.4s, v4.4s
+ SSHL v23.4s, v23.4s, v4.4s
+ LD1R {v5.4s}, [x11], 4
+ SSHL v24.4s, v24.4s, v4.4s
+ SSHL v25.4s, v25.4s, v4.4s
+ SSHL v26.4s, v26.4s, v4.4s
+ SSHL v27.4s, v27.4s, v4.4s
+ SSHL v28.4s, v28.4s, v4.4s
+ SSHL v29.4s, v29.4s, v4.4s
+ SSHL v30.4s, v30.4s, v4.4s
+ SSHL v31.4s, v31.4s, v4.4s
+ LD1R {v6.4s}, [x11], 4
+ SQDMULH v16.4s, v16.4s, v5.4s // scale without rounding
+ SQDMULH v17.4s, v17.4s, v5.4s
+ SQDMULH v18.4s, v18.4s, v5.4s
+ SQDMULH v19.4s, v19.4s, v5.4s
+ SQDMULH v20.4s, v20.4s, v5.4s
+ SQDMULH v21.4s, v21.4s, v5.4s
+ SQDMULH v22.4s, v22.4s, v5.4s
+ SQDMULH v23.4s, v23.4s, v5.4s
+ SQDMULH v24.4s, v24.4s, v5.4s
+ SQDMULH v25.4s, v25.4s, v5.4s
+ SQDMULH v26.4s, v26.4s, v5.4s
+ SQDMULH v27.4s, v27.4s, v5.4s
+ SQDMULH v28.4s, v28.4s, v5.4s
+ SQDMULH v29.4s, v29.4s, v5.4s
+ SQDMULH v30.4s, v30.4s, v5.4s
+ SQDMULH v31.4s, v31.4s, v5.4s
+ SRSHL v16.4s, v16.4s, v6.4s // signed rounding shift left
+ SRSHL v17.4s, v17.4s, v6.4s
+ SRSHL v18.4s, v18.4s, v6.4s
+ SRSHL v19.4s, v19.4s, v6.4s
+ SRSHL v20.4s, v20.4s, v6.4s
+ SRSHL v21.4s, v21.4s, v6.4s
+ SRSHL v22.4s, v22.4s, v6.4s
+ SRSHL v23.4s, v23.4s, v6.4s
+ SRSHL v24.4s, v24.4s, v6.4s
+ SRSHL v25.4s, v25.4s, v6.4s
+ SRSHL v26.4s, v26.4s, v6.4s
+ SRSHL v27.4s, v27.4s, v6.4s
+ SRSHL v28.4s, v28.4s, v6.4s
+ SRSHL v29.4s, v29.4s, v6.4s
+ SRSHL v30.4s, v30.4s, v6.4s
+ SRSHL v31.4s, v31.4s, v6.4s
+ $elif REQUANTIZATION == "FP32":
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ $if not CHANNELWISE:
+ # Apply params - scale, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ $else:
+ # Load per channel scale values from weights
+ LDR q4, [x5], 16
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ LDR q5, [x5], 16
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ $if CHANNELWISE:
+ LDR q6, [x5], 16
+ FMUL v16.4s, v16.4s, v4.4s
+ FMUL v17.4s, v17.4s, v4.4s
+ FMUL v18.4s, v18.4s, v4.4s
+ FMUL v19.4s, v19.4s, v4.4s
+ FMUL v20.4s, v20.4s, v5.4s
+ LDR q4, [x5], 16
+ FMUL v21.4s, v21.4s, v5.4s
+ FMUL v22.4s, v22.4s, v5.4s
+ FMUL v23.4s, v23.4s, v5.4s
+ FMUL v24.4s, v24.4s, v6.4s
+ FMUL v25.4s, v25.4s, v6.4s
+ FMUL v26.4s, v26.4s, v6.4s
+ FMUL v27.4s, v27.4s, v6.4s
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+ $else:
+ FMUL v16.4s, v16.4s, v4.4s
+ FMUL v17.4s, v17.4s, v4.4s
+ FMUL v18.4s, v18.4s, v4.4s
+ FMUL v19.4s, v19.4s, v4.4s
+ FMUL v20.4s, v20.4s, v4.4s
+ FMUL v21.4s, v21.4s, v4.4s
+ FMUL v22.4s, v22.4s, v4.4s
+ FMUL v23.4s, v23.4s, v4.4s
+ FMUL v24.4s, v24.4s, v4.4s
+ FMUL v25.4s, v25.4s, v4.4s
+ FMUL v26.4s, v26.4s, v4.4s
+ FMUL v27.4s, v27.4s, v4.4s
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v6.8h
+ SQADD v17.8h, v17.8h, v6.8h
+ SQADD v18.8h, v18.8h, v6.8h
+ SQADD v19.8h, v19.8h, v6.8h
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v4.16b}, [x11], 1 // clamp min value
+
+ ${SQXTXN} v0.8b, v16.8h
+ ${SQXTXN} v1.8b, v17.8h
+ ${SQXTXN} v2.8b, v18.8h
+ ${SQXTXN} v3.8b, v19.8h
+ LD1R {v5.16b}, [x11] // clamp max value
+ ${SQXTXN2} v0.16b, v24.8h
+ ${SQXTXN2} v1.16b, v25.8h
+ ${SQXTXN2} v2.16b, v26.8h
+ ${SQXTXN2} v3.16b, v27.8h
+ SUB x11, x11, ${REWIND_DECREMENT} // rewind params pointer
+
+ ${XMAX} v0.16b, v0.16b, v4.16b
+ ${XMAX} v1.16b, v1.16b, v4.16b
+ ${XMAX} v2.16b, v2.16b, v4.16b
+ ${XMAX} v3.16b, v3.16b, v4.16b
+ SUBS x1, x1, 16
+ ${XMIN} v0.16b, v0.16b, v5.16b
+ ${XMIN} v1.16b, v1.16b, v5.16b
+ ${XMIN} v2.16b, v2.16b, v5.16b
+ ${XMIN} v3.16b, v3.16b, v5.16b
+ B.LO 5f
+
+ # Store full 4 x 16
+ ST1 {v3.16b}, [x7], x10
+ ST1 {v2.16b}, [x17], x10
+ ST1 {v1.16b}, [x16], x10
+ ST1 {v0.16b}, [x6], x10
+
+ SUB x4, x4, x3 // a -= ks
+
+ # nc loop
+ B.HI 0b
+
+ # Restore x20-x21 from stack
+ LDP x20, x21, [sp], 16
+ RET
+
+        # Remainder - 1 to 7 bytes of A
+ .p2align 3
+4:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x13], x0
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x14], x0
+ LD1 {v2.8b}, [x15], x0
+ LD1 {v3.8b}, [x20], x0
+ ${XXTL} v0.8h, v0.8b
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ ${XXTL} v1.8h, v1.8b
+ ${XXTL} v2.8h, v2.8b
+ ${XXTL} v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 3b
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 3b
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 3b
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 3b
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 3b
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 3b
+
+ LDP d4, d5, [x5], 16
+ $if DATATYPE == "QU8":
+ USUBL v4.8h, v4.8b, v7.8b
+ $else:
+ SXTL v4.8h, v4.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 3b
+
+ # Store odd width
+ .p2align 3
+5:
+ TBZ x1, 3, 6f
+ STR d3, [x7], 8
+ STR d2, [x17], 8
+ DUP d3, v3.d[1]
+ DUP d2, v2.d[1]
+ STR d1, [x16], 8
+ STR d0, [x6], 8
+ DUP d1, v1.d[1]
+ DUP d0, v0.d[1]
+6:
+ TBZ x1, 2, 7f
+ STR s3, [x7], 4
+ STR s2, [x17], 4
+ DUP s3, v3.s[1]
+ DUP s2, v2.s[1]
+ STR s1, [x16], 4
+ STR s0, [x6], 4
+ DUP s1, v1.s[1]
+ DUP s0, v0.s[1]
+7:
+ TBZ x1, 1, 8f
+ STR h3, [x7], 2
+ STR h2, [x17], 2
+ DUP h3, v3.h[1]
+ DUP h2, v2.h[1]
+ STR h1, [x16], 2
+ STR h0, [x6], 2
+ DUP h1, v1.h[1]
+ DUP h0, v0.h[1]
+8:
+ TBZ x1, 0, 9f
+ STR b3, [x7]
+ STR b2, [x17]
+ STR b1, [x16]
+ STR b0, [x6]
+9:
+ # Restore x20-x21 from stack
+ LDP x20, x21, [sp], 16
+ RET
+
+END_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..42794c2
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,615 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const int8_t** restrict a, x4
+# const int8_t* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x10
+# size_t a_offset, [sp + 8] -> x8
+# const int8_t* zero, [sp + 16] -> x12
+# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0
+# A1 x14 v1
+# A2 x15 v2
+# A3 x20 v3
+# B x5 v4 v5 v6
+# C0 x6 v16 v20 v24 v28
+# C1 x16 v17 v21 v25 v29
+# C2 x17 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+# x11, x21 temp for Cortex-A53 loads
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+        # Clamp C pointers
+        CMP x0, 2 // if mr < 2
+        LDP x10, x8, [sp] // Load cn_stride, a_offset
+        ADD x16, x6, x7 // c1 = c0 + cm_stride
+        CSEL x16, x6, x16, LO // c1 = c0
+
+        ADD x17, x16, x7 // c2 = c1 + cm_stride
+        LDP x12, x11, [sp, 16] // Load zero, params pointer
+        // if mr <= 2
+        CSEL x17, x16, x17, LS // c2 = c1
+
+        CMP x0, 4 // if mr < 4
+        STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+        ADD x7, x17, x7 // c3 = c2 + cm_stride
+        CSEL x7, x17, x7, LO // c3 = c2
+
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP q16, q20, [x5], 32
+        MOV v17.16b, v16.16b
+        MOV v18.16b, v16.16b
+        LDP q24, q28, [x5], 32
+        MOV v19.16b, v16.16b
+        MOV v21.16b, v20.16b
+        MOV v22.16b, v20.16b
+        MOV v23.16b, v20.16b
+        MOV v25.16b, v24.16b
+        MOV v26.16b, v24.16b
+        MOV v27.16b, v24.16b
+        MOV v29.16b, v28.16b
+        MOV v30.16b, v28.16b
+        MOV v31.16b, v28.16b
+        MOV x9, x3 // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP x13, x14, [x4], 16
+        LDP x15, x20, [x4], 16
+
+        CMP x13, x12 // if a0 == zero
+        ADD x13, x13, x8 // a0 += a_offset
+        CSEL x13, x12, x13, EQ // a0 = zero if a0 == zero, else a0 + a_offset
+        CMP x14, x12 // if a1 == zero
+        ADD x14, x14, x8 // a1 += a_offset
+        CSEL x14, x12, x14, EQ // a1 = zero if a1 == zero, else a1 + a_offset
+        CMP x15, x12 // if a2 == zero
+        ADD x15, x15, x8 // a2 += a_offset
+        CSEL x15, x12, x15, EQ // a2 = zero if a2 == zero, else a2 + a_offset
+        CMP x20, x12 // if a3 == zero
+        ADD x20, x20, x8 // a3 += a_offset
+        CSEL x20, x12, x20, EQ // a3 = zero if a3 == zero, else a3 + a_offset
+
+        # Is there at least 8 bytes for main loop?
+        SUBS x0, x2, 8 // k = kc - 8
+        B.LO 4f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+2:
+        LD1 {v0.8b}, [x13], 8
+        LDP d4, d5, [x5], 16 // B for lane 0
+        LD1 {v1.8b}, [x14], 8
+        LD1 {v2.8b}, [x15], 8
+        LD1 {v3.8b}, [x20], 8
+        SXTL v0.8h, v0.8b
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SXTL v1.8h, v1.8b
+        SXTL v2.8h, v2.8b
+        SXTL v3.8h, v3.8b
+        SMLAL v16.4s, v4.4h, v0.h[0]
+        SMLAL2 v20.4s, v4.8h, v0.h[0]
+        SMLAL v24.4s, v5.4h, v0.h[0]
+        SMLAL2 v28.4s, v5.8h, v0.h[0]
+        SMLAL v17.4s, v4.4h, v1.h[0]
+        SMLAL2 v21.4s, v4.8h, v1.h[0]
+        SMLAL v25.4s, v5.4h, v1.h[0]
+        SMLAL2 v29.4s, v5.8h, v1.h[0]
+        SMLAL v18.4s, v4.4h, v2.h[0]
+        SMLAL2 v22.4s, v4.8h, v2.h[0]
+        SMLAL v26.4s, v5.4h, v2.h[0]
+        SMLAL2 v30.4s, v5.8h, v2.h[0]
+        SMLAL v19.4s, v4.4h, v3.h[0]
+        SMLAL2 v23.4s, v4.8h, v3.h[0]
+        SMLAL v27.4s, v5.4h, v3.h[0]
+        SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+        LDP d4, d5, [x5], 16 // B for lane 1
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[1]
+        SMLAL2 v20.4s, v4.8h, v0.h[1]
+        SMLAL v24.4s, v5.4h, v0.h[1]
+        SMLAL2 v28.4s, v5.8h, v0.h[1]
+        SMLAL v17.4s, v4.4h, v1.h[1]
+        SMLAL2 v21.4s, v4.8h, v1.h[1]
+        SMLAL v25.4s, v5.4h, v1.h[1]
+        SMLAL2 v29.4s, v5.8h, v1.h[1]
+        SMLAL v18.4s, v4.4h, v2.h[1]
+        SMLAL2 v22.4s, v4.8h, v2.h[1]
+        SMLAL v26.4s, v5.4h, v2.h[1]
+        SMLAL2 v30.4s, v5.8h, v2.h[1]
+        SMLAL v19.4s, v4.4h, v3.h[1]
+        SMLAL2 v23.4s, v4.8h, v3.h[1]
+        SMLAL v27.4s, v5.4h, v3.h[1]
+        SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+        LDP d4, d5, [x5], 16 // B for lane 2
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[2]
+        SMLAL2 v20.4s, v4.8h, v0.h[2]
+        SMLAL v24.4s, v5.4h, v0.h[2]
+        SMLAL2 v28.4s, v5.8h, v0.h[2]
+        SMLAL v17.4s, v4.4h, v1.h[2]
+        SMLAL2 v21.4s, v4.8h, v1.h[2]
+        SMLAL v25.4s, v5.4h, v1.h[2]
+        SMLAL2 v29.4s, v5.8h, v1.h[2]
+        SMLAL v18.4s, v4.4h, v2.h[2]
+        SMLAL2 v22.4s, v4.8h, v2.h[2]
+        SMLAL v26.4s, v5.4h, v2.h[2]
+        SMLAL2 v30.4s, v5.8h, v2.h[2]
+        SMLAL v19.4s, v4.4h, v3.h[2]
+        SMLAL2 v23.4s, v4.8h, v3.h[2]
+        SMLAL v27.4s, v5.4h, v3.h[2]
+        SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+        LDP d4, d5, [x5], 16 // B for lane 3
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[3]
+        SMLAL2 v20.4s, v4.8h, v0.h[3]
+        SMLAL v24.4s, v5.4h, v0.h[3]
+        SMLAL2 v28.4s, v5.8h, v0.h[3]
+        SMLAL v17.4s, v4.4h, v1.h[3]
+        SMLAL2 v21.4s, v4.8h, v1.h[3]
+        SMLAL v25.4s, v5.4h, v1.h[3]
+        SMLAL2 v29.4s, v5.8h, v1.h[3]
+        SMLAL v18.4s, v4.4h, v2.h[3]
+        SMLAL2 v22.4s, v4.8h, v2.h[3]
+        SMLAL v26.4s, v5.4h, v2.h[3]
+        SMLAL2 v30.4s, v5.8h, v2.h[3]
+        SMLAL v19.4s, v4.4h, v3.h[3]
+        SMLAL2 v23.4s, v4.8h, v3.h[3]
+        SMLAL v27.4s, v5.4h, v3.h[3]
+        SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+        LDP d4, d5, [x5], 16 // B for lane 4
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[4]
+        SMLAL2 v20.4s, v4.8h, v0.h[4]
+        SMLAL v24.4s, v5.4h, v0.h[4]
+        SMLAL2 v28.4s, v5.8h, v0.h[4]
+        SMLAL v17.4s, v4.4h, v1.h[4]
+        SMLAL2 v21.4s, v4.8h, v1.h[4]
+        SMLAL v25.4s, v5.4h, v1.h[4]
+        SMLAL2 v29.4s, v5.8h, v1.h[4]
+        SMLAL v18.4s, v4.4h, v2.h[4]
+        SMLAL2 v22.4s, v4.8h, v2.h[4]
+        SMLAL v26.4s, v5.4h, v2.h[4]
+        SMLAL2 v30.4s, v5.8h, v2.h[4]
+        SMLAL v19.4s, v4.4h, v3.h[4]
+        SMLAL2 v23.4s, v4.8h, v3.h[4]
+        SMLAL v27.4s, v5.4h, v3.h[4]
+        SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+        LDP d4, d5, [x5], 16 // B for lane 5
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[5]
+        SMLAL2 v20.4s, v4.8h, v0.h[5]
+        SMLAL v24.4s, v5.4h, v0.h[5]
+        SMLAL2 v28.4s, v5.8h, v0.h[5]
+        SMLAL v17.4s, v4.4h, v1.h[5]
+        SMLAL2 v21.4s, v4.8h, v1.h[5]
+        SMLAL v25.4s, v5.4h, v1.h[5]
+        SMLAL2 v29.4s, v5.8h, v1.h[5]
+        SMLAL v18.4s, v4.4h, v2.h[5]
+        SMLAL2 v22.4s, v4.8h, v2.h[5]
+        SMLAL v26.4s, v5.4h, v2.h[5]
+        SMLAL2 v30.4s, v5.8h, v2.h[5]
+        SMLAL v19.4s, v4.4h, v3.h[5]
+        SMLAL2 v23.4s, v4.8h, v3.h[5]
+        SMLAL v27.4s, v5.4h, v3.h[5]
+        SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+        LDP d4, d5, [x5], 16 // B for lane 6
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[6]
+        SMLAL2 v20.4s, v4.8h, v0.h[6]
+        SMLAL v24.4s, v5.4h, v0.h[6]
+        SMLAL2 v28.4s, v5.8h, v0.h[6]
+        SMLAL v17.4s, v4.4h, v1.h[6]
+        SMLAL2 v21.4s, v4.8h, v1.h[6]
+        SMLAL v25.4s, v5.4h, v1.h[6]
+        SMLAL2 v29.4s, v5.8h, v1.h[6]
+        SMLAL v18.4s, v4.4h, v2.h[6]
+        SMLAL2 v22.4s, v4.8h, v2.h[6]
+        SMLAL v26.4s, v5.4h, v2.h[6]
+        SMLAL2 v30.4s, v5.8h, v2.h[6]
+        SMLAL v19.4s, v4.4h, v3.h[6]
+        SMLAL2 v23.4s, v4.8h, v3.h[6]
+        SMLAL v27.4s, v5.4h, v3.h[6]
+        SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+        LDP d4, d5, [x5], 16 // B for lane 7
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[7]
+        SMLAL2 v20.4s, v4.8h, v0.h[7]
+        SMLAL v24.4s, v5.4h, v0.h[7]
+        SMLAL2 v28.4s, v5.8h, v0.h[7]
+        SMLAL v17.4s, v4.4h, v1.h[7]
+        SMLAL2 v21.4s, v4.8h, v1.h[7]
+        SMLAL v25.4s, v5.4h, v1.h[7]
+        SMLAL2 v29.4s, v5.8h, v1.h[7]
+        SMLAL v18.4s, v4.4h, v2.h[7]
+        SMLAL2 v22.4s, v4.8h, v2.h[7]
+        SMLAL v26.4s, v5.4h, v2.h[7]
+        SMLAL2 v30.4s, v5.8h, v2.h[7]
+        SMLAL v19.4s, v4.4h, v3.h[7]
+        SMLAL2 v23.4s, v4.8h, v3.h[7]
+        SMLAL v27.4s, v5.4h, v3.h[7]
+        SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+        SUBS x0, x0, 8 // k -= 8
+        B.HS 2b
+
+        AND x0, x2, 7 // kc remainder 0 to 7
+        # Is there a remainder? 1 to 7 bytes of A
+        CBNZ x0, 4f
+
+3:
+        # ks loop
+        SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
+        B.HI 1b
+
+        SCVTF v16.4s, v16.4s
+        SCVTF v17.4s, v17.4s
+        # Apply params - scale, bias and clamp
+        LD1R {v4.4s}, [x11], 4 // scale
+        SCVTF v18.4s, v18.4s
+        SCVTF v19.4s, v19.4s
+        SCVTF v20.4s, v20.4s
+        SCVTF v21.4s, v21.4s
+        SCVTF v22.4s, v22.4s
+        SCVTF v23.4s, v23.4s
+        SCVTF v24.4s, v24.4s
+        SCVTF v25.4s, v25.4s
+        SCVTF v26.4s, v26.4s
+        SCVTF v27.4s, v27.4s
+        SCVTF v28.4s, v28.4s
+        SCVTF v29.4s, v29.4s
+        SCVTF v30.4s, v30.4s
+        SCVTF v31.4s, v31.4s
+
+        FMUL v16.4s, v16.4s, v4.4s
+        FMUL v17.4s, v17.4s, v4.4s
+        FMUL v18.4s, v18.4s, v4.4s
+        FMUL v19.4s, v19.4s, v4.4s
+        FMUL v20.4s, v20.4s, v4.4s
+        FMUL v21.4s, v21.4s, v4.4s
+        FMUL v22.4s, v22.4s, v4.4s
+        FMUL v23.4s, v23.4s, v4.4s
+        FMUL v24.4s, v24.4s, v4.4s
+        FMUL v25.4s, v25.4s, v4.4s
+        FMUL v26.4s, v26.4s, v4.4s
+        FMUL v27.4s, v27.4s, v4.4s
+        FMUL v28.4s, v28.4s, v4.4s
+        FMUL v29.4s, v29.4s, v4.4s
+        FMUL v30.4s, v30.4s, v4.4s
+        FMUL v31.4s, v31.4s, v4.4s
+
+        FCVTNS v16.4s, v16.4s
+        FCVTNS v17.4s, v17.4s
+        FCVTNS v18.4s, v18.4s
+        FCVTNS v19.4s, v19.4s
+        FCVTNS v20.4s, v20.4s
+        FCVTNS v21.4s, v21.4s
+        FCVTNS v22.4s, v22.4s
+        FCVTNS v23.4s, v23.4s
+        FCVTNS v24.4s, v24.4s
+        FCVTNS v25.4s, v25.4s
+        FCVTNS v26.4s, v26.4s
+        FCVTNS v27.4s, v27.4s
+        FCVTNS v28.4s, v28.4s
+        FCVTNS v29.4s, v29.4s
+        FCVTNS v30.4s, v30.4s
+        FCVTNS v31.4s, v31.4s
+
+        SQXTN v16.4h, v16.4s
+        SQXTN v17.4h, v17.4s
+        SQXTN v18.4h, v18.4s
+        SQXTN v19.4h, v19.4s
+        SQXTN v24.4h, v24.4s
+        SQXTN v25.4h, v25.4s
+        SQXTN v26.4h, v26.4s
+        SQXTN v27.4h, v27.4s
+        LD1R {v6.8h}, [x11], 2 // add bias
+
+        SQXTN2 v16.8h, v20.4s
+        SQXTN2 v17.8h, v21.4s
+        SQXTN2 v18.8h, v22.4s
+        SQXTN2 v19.8h, v23.4s
+        SQXTN2 v24.8h, v28.4s
+        SQXTN2 v25.8h, v29.4s
+        SQXTN2 v26.8h, v30.4s
+        SQXTN2 v27.8h, v31.4s
+
+        SQADD v16.8h, v16.8h, v6.8h
+        SQADD v17.8h, v17.8h, v6.8h
+        SQADD v18.8h, v18.8h, v6.8h
+        SQADD v19.8h, v19.8h, v6.8h
+        SQADD v24.8h, v24.8h, v6.8h
+        SQADD v25.8h, v25.8h, v6.8h
+        SQADD v26.8h, v26.8h, v6.8h
+        SQADD v27.8h, v27.8h, v6.8h
+        LD1R {v4.16b}, [x11], 1 // clamp min value
+
+        SQXTN v0.8b, v16.8h
+        SQXTN v1.8b, v17.8h
+        SQXTN v2.8b, v18.8h
+        SQXTN v3.8b, v19.8h
+        LD1R {v5.16b}, [x11] // clamp max value
+        SQXTN2 v0.16b, v24.8h
+        SQXTN2 v1.16b, v25.8h
+        SQXTN2 v2.16b, v26.8h
+        SQXTN2 v3.16b, v27.8h
+        SUB x11, x11, 7 // rewind params pointer (4+2+1 bytes read above)
+
+        SMAX v0.16b, v0.16b, v4.16b
+        SMAX v1.16b, v1.16b, v4.16b
+        SMAX v2.16b, v2.16b, v4.16b
+        SMAX v3.16b, v3.16b, v4.16b
+        SUBS x1, x1, 16 // nc -= 16
+        SMIN v0.16b, v0.16b, v5.16b
+        SMIN v1.16b, v1.16b, v5.16b
+        SMIN v2.16b, v2.16b, v5.16b
+        SMIN v3.16b, v3.16b, v5.16b
+        B.LO 5f
+
+        # Store full 4 x 16
+        ST1 {v3.16b}, [x7], x10
+        ST1 {v2.16b}, [x17], x10
+        ST1 {v1.16b}, [x16], x10
+        ST1 {v0.16b}, [x6], x10
+
+        SUB x4, x4, x3 // a -= ks
+
+        # nc loop
+        B.HI 0b
+
+        # Restore x20-x21 from stack
+        LDP x20, x21, [sp], 16
+        RET
+
+        # Remainder - 1 to 7 bytes of A
+        .p2align 3
+4:
+        AND x0, x2, 7 // kc remainder 1 to 7
+
+        LD1 {v0.8b}, [x13], x0 // loads 8 bytes, advances by k; only k lanes consumed below
+        LDP d4, d5, [x5], 16 // B for lane 0
+        LD1 {v1.8b}, [x14], x0
+        LD1 {v2.8b}, [x15], x0
+        LD1 {v3.8b}, [x20], x0
+        SXTL v0.8h, v0.8b
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SXTL v1.8h, v1.8b
+        SXTL v2.8h, v2.8b
+        SXTL v3.8h, v3.8b
+        SMLAL v16.4s, v4.4h, v0.h[0]
+        SMLAL2 v20.4s, v4.8h, v0.h[0]
+        SMLAL v24.4s, v5.4h, v0.h[0]
+        SMLAL2 v28.4s, v5.8h, v0.h[0]
+        SMLAL v17.4s, v4.4h, v1.h[0]
+        SMLAL2 v21.4s, v4.8h, v1.h[0]
+        SMLAL v25.4s, v5.4h, v1.h[0]
+        SMLAL2 v29.4s, v5.8h, v1.h[0]
+        SMLAL v18.4s, v4.4h, v2.h[0]
+        SMLAL2 v22.4s, v4.8h, v2.h[0]
+        SMLAL v26.4s, v5.4h, v2.h[0]
+        SMLAL2 v30.4s, v5.8h, v2.h[0]
+        SMLAL v19.4s, v4.4h, v3.h[0]
+        SMLAL2 v23.4s, v4.8h, v3.h[0]
+        SMLAL v27.4s, v5.4h, v3.h[0]
+        SMLAL2 v31.4s, v5.8h, v3.h[0]
+        CMP x0, 2 // done if k < 2
+        B.LO 3b
+
+        LDP d4, d5, [x5], 16 // B for lane 1
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[1]
+        SMLAL2 v20.4s, v4.8h, v0.h[1]
+        SMLAL v24.4s, v5.4h, v0.h[1]
+        SMLAL2 v28.4s, v5.8h, v0.h[1]
+        SMLAL v17.4s, v4.4h, v1.h[1]
+        SMLAL2 v21.4s, v4.8h, v1.h[1]
+        SMLAL v25.4s, v5.4h, v1.h[1]
+        SMLAL2 v29.4s, v5.8h, v1.h[1]
+        SMLAL v18.4s, v4.4h, v2.h[1]
+        SMLAL2 v22.4s, v4.8h, v2.h[1]
+        SMLAL v26.4s, v5.4h, v2.h[1]
+        SMLAL2 v30.4s, v5.8h, v2.h[1]
+        SMLAL v19.4s, v4.4h, v3.h[1]
+        SMLAL2 v23.4s, v4.8h, v3.h[1]
+        SMLAL v27.4s, v5.4h, v3.h[1]
+        SMLAL2 v31.4s, v5.8h, v3.h[1]
+        B.EQ 3b // done if k == 2
+
+        LDP d4, d5, [x5], 16 // B for lane 2
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[2]
+        SMLAL2 v20.4s, v4.8h, v0.h[2]
+        SMLAL v24.4s, v5.4h, v0.h[2]
+        SMLAL2 v28.4s, v5.8h, v0.h[2]
+        SMLAL v17.4s, v4.4h, v1.h[2]
+        SMLAL2 v21.4s, v4.8h, v1.h[2]
+        SMLAL v25.4s, v5.4h, v1.h[2]
+        SMLAL2 v29.4s, v5.8h, v1.h[2]
+        SMLAL v18.4s, v4.4h, v2.h[2]
+        SMLAL2 v22.4s, v4.8h, v2.h[2]
+        SMLAL v26.4s, v5.4h, v2.h[2]
+        SMLAL2 v30.4s, v5.8h, v2.h[2]
+        SMLAL v19.4s, v4.4h, v3.h[2]
+        SMLAL2 v23.4s, v4.8h, v3.h[2]
+        SMLAL v27.4s, v5.4h, v3.h[2]
+        SMLAL2 v31.4s, v5.8h, v3.h[2]
+        CMP x0, 4 // done if k < 4
+        B.LO 3b
+
+        LDP d4, d5, [x5], 16 // B for lane 3
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[3]
+        SMLAL2 v20.4s, v4.8h, v0.h[3]
+        SMLAL v24.4s, v5.4h, v0.h[3]
+        SMLAL2 v28.4s, v5.8h, v0.h[3]
+        SMLAL v17.4s, v4.4h, v1.h[3]
+        SMLAL2 v21.4s, v4.8h, v1.h[3]
+        SMLAL v25.4s, v5.4h, v1.h[3]
+        SMLAL2 v29.4s, v5.8h, v1.h[3]
+        SMLAL v18.4s, v4.4h, v2.h[3]
+        SMLAL2 v22.4s, v4.8h, v2.h[3]
+        SMLAL v26.4s, v5.4h, v2.h[3]
+        SMLAL2 v30.4s, v5.8h, v2.h[3]
+        SMLAL v19.4s, v4.4h, v3.h[3]
+        SMLAL2 v23.4s, v4.8h, v3.h[3]
+        SMLAL v27.4s, v5.4h, v3.h[3]
+        SMLAL2 v31.4s, v5.8h, v3.h[3]
+        B.EQ 3b // done if k == 4
+
+        LDP d4, d5, [x5], 16 // B for lane 4
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[4]
+        SMLAL2 v20.4s, v4.8h, v0.h[4]
+        SMLAL v24.4s, v5.4h, v0.h[4]
+        SMLAL2 v28.4s, v5.8h, v0.h[4]
+        SMLAL v17.4s, v4.4h, v1.h[4]
+        SMLAL2 v21.4s, v4.8h, v1.h[4]
+        SMLAL v25.4s, v5.4h, v1.h[4]
+        SMLAL2 v29.4s, v5.8h, v1.h[4]
+        SMLAL v18.4s, v4.4h, v2.h[4]
+        SMLAL2 v22.4s, v4.8h, v2.h[4]
+        SMLAL v26.4s, v5.4h, v2.h[4]
+        SMLAL2 v30.4s, v5.8h, v2.h[4]
+        SMLAL v19.4s, v4.4h, v3.h[4]
+        SMLAL2 v23.4s, v4.8h, v3.h[4]
+        SMLAL v27.4s, v5.4h, v3.h[4]
+        SMLAL2 v31.4s, v5.8h, v3.h[4]
+        CMP x0, 6 // done if k < 6
+        B.LO 3b
+
+        LDP d4, d5, [x5], 16 // B for lane 5
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[5]
+        SMLAL2 v20.4s, v4.8h, v0.h[5]
+        SMLAL v24.4s, v5.4h, v0.h[5]
+        SMLAL2 v28.4s, v5.8h, v0.h[5]
+        SMLAL v17.4s, v4.4h, v1.h[5]
+        SMLAL2 v21.4s, v4.8h, v1.h[5]
+        SMLAL v25.4s, v5.4h, v1.h[5]
+        SMLAL2 v29.4s, v5.8h, v1.h[5]
+        SMLAL v18.4s, v4.4h, v2.h[5]
+        SMLAL2 v22.4s, v4.8h, v2.h[5]
+        SMLAL v26.4s, v5.4h, v2.h[5]
+        SMLAL2 v30.4s, v5.8h, v2.h[5]
+        SMLAL v19.4s, v4.4h, v3.h[5]
+        SMLAL2 v23.4s, v4.8h, v3.h[5]
+        SMLAL v27.4s, v5.4h, v3.h[5]
+        SMLAL2 v31.4s, v5.8h, v3.h[5]
+        B.EQ 3b // done if k == 6
+
+        LDP d4, d5, [x5], 16 // B for lane 6
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[6]
+        SMLAL2 v20.4s, v4.8h, v0.h[6]
+        SMLAL v24.4s, v5.4h, v0.h[6]
+        SMLAL2 v28.4s, v5.8h, v0.h[6]
+        SMLAL v17.4s, v4.4h, v1.h[6]
+        SMLAL2 v21.4s, v4.8h, v1.h[6]
+        SMLAL v25.4s, v5.4h, v1.h[6]
+        SMLAL2 v29.4s, v5.8h, v1.h[6]
+        SMLAL v18.4s, v4.4h, v2.h[6]
+        SMLAL2 v22.4s, v4.8h, v2.h[6]
+        SMLAL v26.4s, v5.4h, v2.h[6]
+        SMLAL2 v30.4s, v5.8h, v2.h[6]
+        SMLAL v19.4s, v4.4h, v3.h[6]
+        SMLAL2 v23.4s, v4.8h, v3.h[6]
+        SMLAL v27.4s, v5.4h, v3.h[6]
+        SMLAL2 v31.4s, v5.8h, v3.h[6]
+        B 3b // k == 7 is the largest remainder
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ x1, 3, 6f // skip if (nc & 8) == 0
+        STR d3, [x7], 8
+        STR d2, [x17], 8
+        DUP d3, v3.d[1]
+        DUP d2, v2.d[1]
+        STR d1, [x16], 8
+        STR d0, [x6], 8
+        DUP d1, v1.d[1]
+        DUP d0, v0.d[1]
+6:
+        TBZ x1, 2, 7f // skip if (nc & 4) == 0
+        STR s3, [x7], 4
+        STR s2, [x17], 4
+        DUP s3, v3.s[1]
+        DUP s2, v2.s[1]
+        STR s1, [x16], 4
+        STR s0, [x6], 4
+        DUP s1, v1.s[1]
+        DUP s0, v0.s[1]
+7:
+        TBZ x1, 1, 8f // skip if (nc & 2) == 0
+        STR h3, [x7], 2
+        STR h2, [x17], 2
+        DUP h3, v3.h[1]
+        DUP h2, v2.h[1]
+        STR h1, [x16], 2
+        STR h0, [x6], 2
+        DUP h1, v1.h[1]
+        DUP h0, v0.h[1]
+8:
+        TBZ x1, 0, 9f // skip if (nc & 1) == 0
+        STR b3, [x7]
+        STR b2, [x17]
+        STR b1, [x16]
+        STR b0, [x6]
+9:
+        # Restore x20-x21 from stack
+        LDP x20, x21, [sp], 16
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..b9cda9e
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,621 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const int8_t** restrict a, x4
+# const int8_t* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x10
+# size_t a_offset, [sp + 8] -> x8
+# const int8_t* zero, [sp + 16] -> x12
+# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0
+# A1 x14 v1
+# A2 x15 v2
+# A3 x20 v3
+# B x5 v4 v5 v6
+# C0 x6 v16 v20 v24 v28
+# C1 x16 v17 v21 v25 v29
+# C2 x17 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+# x11, x21 temp for Cortex-A53 loads
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+        # Clamp C pointers
+        CMP x0, 2 // if mr < 2
+        LDP x10, x8, [sp] // Load cn_stride, a_offset
+        ADD x16, x6, x7 // c1 = c0 + cm_stride
+        CSEL x16, x6, x16, LO // c1 = c0
+
+        ADD x17, x16, x7 // c2 = c1 + cm_stride
+        LDP x12, x11, [sp, 16] // Load zero, params pointer
+        // if mr <= 2
+        CSEL x17, x16, x17, LS // c2 = c1
+
+        CMP x0, 4 // if mr < 4
+        STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+        ADD x7, x17, x7 // c3 = c2 + cm_stride
+        CSEL x7, x17, x7, LO // c3 = c2
+
+
+        .p2align 3
+0:
+        # Load initial bias from w into accumulators
+        LDP q16, q20, [x5], 32
+        MOV v17.16b, v16.16b
+        MOV v18.16b, v16.16b
+        LDP q24, q28, [x5], 32
+        MOV v19.16b, v16.16b
+        MOV v21.16b, v20.16b
+        MOV v22.16b, v20.16b
+        MOV v23.16b, v20.16b
+        MOV v25.16b, v24.16b
+        MOV v26.16b, v24.16b
+        MOV v27.16b, v24.16b
+        MOV v29.16b, v28.16b
+        MOV v30.16b, v28.16b
+        MOV v31.16b, v28.16b
+        MOV x9, x3 // p = ks
+
+        .p2align 3
+1:
+        # Load next 4 A pointers
+        LDP x13, x14, [x4], 16
+        LDP x15, x20, [x4], 16
+
+        CMP x13, x12 // if a0 == zero
+        ADD x13, x13, x8 // a0 += a_offset
+        CSEL x13, x12, x13, EQ // a0 = zero if a0 == zero, else a0 + a_offset
+        CMP x14, x12 // if a1 == zero
+        ADD x14, x14, x8 // a1 += a_offset
+        CSEL x14, x12, x14, EQ // a1 = zero if a1 == zero, else a1 + a_offset
+        CMP x15, x12 // if a2 == zero
+        ADD x15, x15, x8 // a2 += a_offset
+        CSEL x15, x12, x15, EQ // a2 = zero if a2 == zero, else a2 + a_offset
+        CMP x20, x12 // if a3 == zero
+        ADD x20, x20, x8 // a3 += a_offset
+        CSEL x20, x12, x20, EQ // a3 = zero if a3 == zero, else a3 + a_offset
+
+        # Is there at least 8 bytes for main loop?
+        SUBS x0, x2, 8 // k = kc - 8
+        B.LO 4f
+
+        # Main loop - 8 bytes of A
+        .p2align 3
+2:
+        LD1 {v0.8b}, [x13], 8
+        LDP d4, d5, [x5], 16 // B for lane 0
+        LD1 {v1.8b}, [x14], 8
+        LD1 {v2.8b}, [x15], 8
+        LD1 {v3.8b}, [x20], 8
+        SXTL v0.8h, v0.8b
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SXTL v1.8h, v1.8b
+        SXTL v2.8h, v2.8b
+        SXTL v3.8h, v3.8b
+        SMLAL v16.4s, v4.4h, v0.h[0]
+        SMLAL2 v20.4s, v4.8h, v0.h[0]
+        PRFM PLDL1KEEP, [x13, 128] // prefetch A0
+        SMLAL v24.4s, v5.4h, v0.h[0]
+        SMLAL2 v28.4s, v5.8h, v0.h[0]
+        PRFM PLDL1KEEP, [x14, 128] // prefetch A1
+        SMLAL v17.4s, v4.4h, v1.h[0]
+        SMLAL2 v21.4s, v4.8h, v1.h[0]
+        PRFM PLDL1KEEP, [x15, 128] // prefetch A2
+        SMLAL v25.4s, v5.4h, v1.h[0]
+        SMLAL2 v29.4s, v5.8h, v1.h[0]
+        PRFM PLDL1KEEP, [x20, 128] // prefetch A3
+        SMLAL v18.4s, v4.4h, v2.h[0]
+        SMLAL2 v22.4s, v4.8h, v2.h[0]
+        PRFM PLDL1KEEP, [x5, 448] // prefetch B
+        SMLAL v26.4s, v5.4h, v2.h[0]
+        SMLAL2 v30.4s, v5.8h, v2.h[0]
+        PRFM PLDL1KEEP, [x5, 512] // prefetch B
+        SMLAL v19.4s, v4.4h, v3.h[0]
+        SMLAL2 v23.4s, v4.8h, v3.h[0]
+        SMLAL v27.4s, v5.4h, v3.h[0]
+        SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+        LDP d4, d5, [x5], 16 // B for lane 1
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[1]
+        SMLAL2 v20.4s, v4.8h, v0.h[1]
+        SMLAL v24.4s, v5.4h, v0.h[1]
+        SMLAL2 v28.4s, v5.8h, v0.h[1]
+        SMLAL v17.4s, v4.4h, v1.h[1]
+        SMLAL2 v21.4s, v4.8h, v1.h[1]
+        SMLAL v25.4s, v5.4h, v1.h[1]
+        SMLAL2 v29.4s, v5.8h, v1.h[1]
+        SMLAL v18.4s, v4.4h, v2.h[1]
+        SMLAL2 v22.4s, v4.8h, v2.h[1]
+        SMLAL v26.4s, v5.4h, v2.h[1]
+        SMLAL2 v30.4s, v5.8h, v2.h[1]
+        SMLAL v19.4s, v4.4h, v3.h[1]
+        SMLAL2 v23.4s, v4.8h, v3.h[1]
+        SMLAL v27.4s, v5.4h, v3.h[1]
+        SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+        LDP d4, d5, [x5], 16 // B for lane 2
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[2]
+        SMLAL2 v20.4s, v4.8h, v0.h[2]
+        SMLAL v24.4s, v5.4h, v0.h[2]
+        SMLAL2 v28.4s, v5.8h, v0.h[2]
+        SMLAL v17.4s, v4.4h, v1.h[2]
+        SMLAL2 v21.4s, v4.8h, v1.h[2]
+        SMLAL v25.4s, v5.4h, v1.h[2]
+        SMLAL2 v29.4s, v5.8h, v1.h[2]
+        SMLAL v18.4s, v4.4h, v2.h[2]
+        SMLAL2 v22.4s, v4.8h, v2.h[2]
+        SMLAL v26.4s, v5.4h, v2.h[2]
+        SMLAL2 v30.4s, v5.8h, v2.h[2]
+        SMLAL v19.4s, v4.4h, v3.h[2]
+        SMLAL2 v23.4s, v4.8h, v3.h[2]
+        SMLAL v27.4s, v5.4h, v3.h[2]
+        SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+        LDP d4, d5, [x5], 16 // B for lane 3
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[3]
+        SMLAL2 v20.4s, v4.8h, v0.h[3]
+        SMLAL v24.4s, v5.4h, v0.h[3]
+        SMLAL2 v28.4s, v5.8h, v0.h[3]
+        SMLAL v17.4s, v4.4h, v1.h[3]
+        SMLAL2 v21.4s, v4.8h, v1.h[3]
+        SMLAL v25.4s, v5.4h, v1.h[3]
+        SMLAL2 v29.4s, v5.8h, v1.h[3]
+        SMLAL v18.4s, v4.4h, v2.h[3]
+        SMLAL2 v22.4s, v4.8h, v2.h[3]
+        SMLAL v26.4s, v5.4h, v2.h[3]
+        SMLAL2 v30.4s, v5.8h, v2.h[3]
+        SMLAL v19.4s, v4.4h, v3.h[3]
+        SMLAL2 v23.4s, v4.8h, v3.h[3]
+        SMLAL v27.4s, v5.4h, v3.h[3]
+        SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+        LDP d4, d5, [x5], 16 // B for lane 4
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[4]
+        SMLAL2 v20.4s, v4.8h, v0.h[4]
+        SMLAL v24.4s, v5.4h, v0.h[4]
+        SMLAL2 v28.4s, v5.8h, v0.h[4]
+        SMLAL v17.4s, v4.4h, v1.h[4]
+        SMLAL2 v21.4s, v4.8h, v1.h[4]
+        SMLAL v25.4s, v5.4h, v1.h[4]
+        SMLAL2 v29.4s, v5.8h, v1.h[4]
+        SMLAL v18.4s, v4.4h, v2.h[4]
+        SMLAL2 v22.4s, v4.8h, v2.h[4]
+        SMLAL v26.4s, v5.4h, v2.h[4]
+        SMLAL2 v30.4s, v5.8h, v2.h[4]
+        SMLAL v19.4s, v4.4h, v3.h[4]
+        SMLAL2 v23.4s, v4.8h, v3.h[4]
+        SMLAL v27.4s, v5.4h, v3.h[4]
+        SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+        LDP d4, d5, [x5], 16 // B for lane 5
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[5]
+        SMLAL2 v20.4s, v4.8h, v0.h[5]
+        SMLAL v24.4s, v5.4h, v0.h[5]
+        SMLAL2 v28.4s, v5.8h, v0.h[5]
+        SMLAL v17.4s, v4.4h, v1.h[5]
+        SMLAL2 v21.4s, v4.8h, v1.h[5]
+        SMLAL v25.4s, v5.4h, v1.h[5]
+        SMLAL2 v29.4s, v5.8h, v1.h[5]
+        SMLAL v18.4s, v4.4h, v2.h[5]
+        SMLAL2 v22.4s, v4.8h, v2.h[5]
+        SMLAL v26.4s, v5.4h, v2.h[5]
+        SMLAL2 v30.4s, v5.8h, v2.h[5]
+        SMLAL v19.4s, v4.4h, v3.h[5]
+        SMLAL2 v23.4s, v4.8h, v3.h[5]
+        SMLAL v27.4s, v5.4h, v3.h[5]
+        SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+        LDP d4, d5, [x5], 16 // B for lane 6
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[6]
+        SMLAL2 v20.4s, v4.8h, v0.h[6]
+        SMLAL v24.4s, v5.4h, v0.h[6]
+        SMLAL2 v28.4s, v5.8h, v0.h[6]
+        SMLAL v17.4s, v4.4h, v1.h[6]
+        SMLAL2 v21.4s, v4.8h, v1.h[6]
+        SMLAL v25.4s, v5.4h, v1.h[6]
+        SMLAL2 v29.4s, v5.8h, v1.h[6]
+        SMLAL v18.4s, v4.4h, v2.h[6]
+        SMLAL2 v22.4s, v4.8h, v2.h[6]
+        SMLAL v26.4s, v5.4h, v2.h[6]
+        SMLAL2 v30.4s, v5.8h, v2.h[6]
+        SMLAL v19.4s, v4.4h, v3.h[6]
+        SMLAL2 v23.4s, v4.8h, v3.h[6]
+        SMLAL v27.4s, v5.4h, v3.h[6]
+        SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+        LDP d4, d5, [x5], 16 // B for lane 7
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[7]
+        SMLAL2 v20.4s, v4.8h, v0.h[7]
+        SMLAL v24.4s, v5.4h, v0.h[7]
+        SMLAL2 v28.4s, v5.8h, v0.h[7]
+        SMLAL v17.4s, v4.4h, v1.h[7]
+        SMLAL2 v21.4s, v4.8h, v1.h[7]
+        SMLAL v25.4s, v5.4h, v1.h[7]
+        SMLAL2 v29.4s, v5.8h, v1.h[7]
+        SMLAL v18.4s, v4.4h, v2.h[7]
+        SMLAL2 v22.4s, v4.8h, v2.h[7]
+        SMLAL v26.4s, v5.4h, v2.h[7]
+        SMLAL2 v30.4s, v5.8h, v2.h[7]
+        SMLAL v19.4s, v4.4h, v3.h[7]
+        SMLAL2 v23.4s, v4.8h, v3.h[7]
+        SMLAL v27.4s, v5.4h, v3.h[7]
+        SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+        SUBS x0, x0, 8 // k -= 8
+        B.HS 2b
+
+        AND x0, x2, 7 // kc remainder 0 to 7
+        # Is there a remainder? 1 to 7 bytes of A
+        CBNZ x0, 4f
+
+3:
+        # ks loop
+        SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
+        B.HI 1b
+
+        SCVTF v16.4s, v16.4s
+        SCVTF v17.4s, v17.4s
+        # Apply params - scale, bias and clamp
+        LD1R {v4.4s}, [x11], 4 // scale
+        SCVTF v18.4s, v18.4s
+        SCVTF v19.4s, v19.4s
+        SCVTF v20.4s, v20.4s
+        SCVTF v21.4s, v21.4s
+        SCVTF v22.4s, v22.4s
+        SCVTF v23.4s, v23.4s
+        SCVTF v24.4s, v24.4s
+        SCVTF v25.4s, v25.4s
+        SCVTF v26.4s, v26.4s
+        SCVTF v27.4s, v27.4s
+        SCVTF v28.4s, v28.4s
+        SCVTF v29.4s, v29.4s
+        SCVTF v30.4s, v30.4s
+        SCVTF v31.4s, v31.4s
+
+        FMUL v16.4s, v16.4s, v4.4s
+        FMUL v17.4s, v17.4s, v4.4s
+        FMUL v18.4s, v18.4s, v4.4s
+        FMUL v19.4s, v19.4s, v4.4s
+        FMUL v20.4s, v20.4s, v4.4s
+        FMUL v21.4s, v21.4s, v4.4s
+        FMUL v22.4s, v22.4s, v4.4s
+        FMUL v23.4s, v23.4s, v4.4s
+        FMUL v24.4s, v24.4s, v4.4s
+        FMUL v25.4s, v25.4s, v4.4s
+        FMUL v26.4s, v26.4s, v4.4s
+        FMUL v27.4s, v27.4s, v4.4s
+        FMUL v28.4s, v28.4s, v4.4s
+        FMUL v29.4s, v29.4s, v4.4s
+        FMUL v30.4s, v30.4s, v4.4s
+        FMUL v31.4s, v31.4s, v4.4s
+
+        FCVTNS v16.4s, v16.4s
+        FCVTNS v17.4s, v17.4s
+        FCVTNS v18.4s, v18.4s
+        FCVTNS v19.4s, v19.4s
+        FCVTNS v20.4s, v20.4s
+        FCVTNS v21.4s, v21.4s
+        FCVTNS v22.4s, v22.4s
+        FCVTNS v23.4s, v23.4s
+        FCVTNS v24.4s, v24.4s
+        FCVTNS v25.4s, v25.4s
+        FCVTNS v26.4s, v26.4s
+        FCVTNS v27.4s, v27.4s
+        FCVTNS v28.4s, v28.4s
+        FCVTNS v29.4s, v29.4s
+        FCVTNS v30.4s, v30.4s
+        FCVTNS v31.4s, v31.4s
+
+        SQXTN v16.4h, v16.4s
+        SQXTN v17.4h, v17.4s
+        SQXTN v18.4h, v18.4s
+        SQXTN v19.4h, v19.4s
+        SQXTN v24.4h, v24.4s
+        SQXTN v25.4h, v25.4s
+        SQXTN v26.4h, v26.4s
+        SQXTN v27.4h, v27.4s
+        LD1R {v6.8h}, [x11], 2 // add bias
+
+        SQXTN2 v16.8h, v20.4s
+        SQXTN2 v17.8h, v21.4s
+        SQXTN2 v18.8h, v22.4s
+        SQXTN2 v19.8h, v23.4s
+        SQXTN2 v24.8h, v28.4s
+        SQXTN2 v25.8h, v29.4s
+        SQXTN2 v26.8h, v30.4s
+        SQXTN2 v27.8h, v31.4s
+
+        SQADD v16.8h, v16.8h, v6.8h
+        SQADD v17.8h, v17.8h, v6.8h
+        SQADD v18.8h, v18.8h, v6.8h
+        SQADD v19.8h, v19.8h, v6.8h
+        SQADD v24.8h, v24.8h, v6.8h
+        SQADD v25.8h, v25.8h, v6.8h
+        SQADD v26.8h, v26.8h, v6.8h
+        SQADD v27.8h, v27.8h, v6.8h
+        LD1R {v4.16b}, [x11], 1 // clamp min value
+
+        SQXTN v0.8b, v16.8h
+        SQXTN v1.8b, v17.8h
+        SQXTN v2.8b, v18.8h
+        SQXTN v3.8b, v19.8h
+        LD1R {v5.16b}, [x11] // clamp max value
+        SQXTN2 v0.16b, v24.8h
+        SQXTN2 v1.16b, v25.8h
+        SQXTN2 v2.16b, v26.8h
+        SQXTN2 v3.16b, v27.8h
+        SUB x11, x11, 7 // rewind params pointer (4+2+1 bytes read above)
+
+        SMAX v0.16b, v0.16b, v4.16b
+        SMAX v1.16b, v1.16b, v4.16b
+        SMAX v2.16b, v2.16b, v4.16b
+        SMAX v3.16b, v3.16b, v4.16b
+        SUBS x1, x1, 16 // nc -= 16
+        SMIN v0.16b, v0.16b, v5.16b
+        SMIN v1.16b, v1.16b, v5.16b
+        SMIN v2.16b, v2.16b, v5.16b
+        SMIN v3.16b, v3.16b, v5.16b
+        B.LO 5f
+
+        # Store full 4 x 16
+        ST1 {v3.16b}, [x7], x10
+        ST1 {v2.16b}, [x17], x10
+        ST1 {v1.16b}, [x16], x10
+        ST1 {v0.16b}, [x6], x10
+
+        SUB x4, x4, x3 // a -= ks
+
+        # nc loop
+        B.HI 0b
+
+        # Restore x20-x21 from stack
+        LDP x20, x21, [sp], 16
+        RET
+
+        # Remainder - 1 to 7 bytes of A
+        .p2align 3
+4:
+        AND x0, x2, 7 // kc remainder 1 to 7
+
+        LD1 {v0.8b}, [x13], x0 // loads 8 bytes, advances by k; only k lanes consumed below
+        LDP d4, d5, [x5], 16 // B for lane 0
+        LD1 {v1.8b}, [x14], x0
+        LD1 {v2.8b}, [x15], x0
+        LD1 {v3.8b}, [x20], x0
+        SXTL v0.8h, v0.8b
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SXTL v1.8h, v1.8b
+        SXTL v2.8h, v2.8b
+        SXTL v3.8h, v3.8b
+        SMLAL v16.4s, v4.4h, v0.h[0]
+        SMLAL2 v20.4s, v4.8h, v0.h[0]
+        SMLAL v24.4s, v5.4h, v0.h[0]
+        SMLAL2 v28.4s, v5.8h, v0.h[0]
+        SMLAL v17.4s, v4.4h, v1.h[0]
+        SMLAL2 v21.4s, v4.8h, v1.h[0]
+        SMLAL v25.4s, v5.4h, v1.h[0]
+        SMLAL2 v29.4s, v5.8h, v1.h[0]
+        SMLAL v18.4s, v4.4h, v2.h[0]
+        SMLAL2 v22.4s, v4.8h, v2.h[0]
+        SMLAL v26.4s, v5.4h, v2.h[0]
+        SMLAL2 v30.4s, v5.8h, v2.h[0]
+        SMLAL v19.4s, v4.4h, v3.h[0]
+        SMLAL2 v23.4s, v4.8h, v3.h[0]
+        SMLAL v27.4s, v5.4h, v3.h[0]
+        SMLAL2 v31.4s, v5.8h, v3.h[0]
+        CMP x0, 2 // done if k < 2
+        B.LO 3b
+
+        LDP d4, d5, [x5], 16 // B for lane 1
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[1]
+        SMLAL2 v20.4s, v4.8h, v0.h[1]
+        SMLAL v24.4s, v5.4h, v0.h[1]
+        SMLAL2 v28.4s, v5.8h, v0.h[1]
+        SMLAL v17.4s, v4.4h, v1.h[1]
+        SMLAL2 v21.4s, v4.8h, v1.h[1]
+        SMLAL v25.4s, v5.4h, v1.h[1]
+        SMLAL2 v29.4s, v5.8h, v1.h[1]
+        SMLAL v18.4s, v4.4h, v2.h[1]
+        SMLAL2 v22.4s, v4.8h, v2.h[1]
+        SMLAL v26.4s, v5.4h, v2.h[1]
+        SMLAL2 v30.4s, v5.8h, v2.h[1]
+        SMLAL v19.4s, v4.4h, v3.h[1]
+        SMLAL2 v23.4s, v4.8h, v3.h[1]
+        SMLAL v27.4s, v5.4h, v3.h[1]
+        SMLAL2 v31.4s, v5.8h, v3.h[1]
+        B.EQ 3b // done if k == 2
+
+        LDP d4, d5, [x5], 16 // B for lane 2
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[2]
+        SMLAL2 v20.4s, v4.8h, v0.h[2]
+        SMLAL v24.4s, v5.4h, v0.h[2]
+        SMLAL2 v28.4s, v5.8h, v0.h[2]
+        SMLAL v17.4s, v4.4h, v1.h[2]
+        SMLAL2 v21.4s, v4.8h, v1.h[2]
+        SMLAL v25.4s, v5.4h, v1.h[2]
+        SMLAL2 v29.4s, v5.8h, v1.h[2]
+        SMLAL v18.4s, v4.4h, v2.h[2]
+        SMLAL2 v22.4s, v4.8h, v2.h[2]
+        SMLAL v26.4s, v5.4h, v2.h[2]
+        SMLAL2 v30.4s, v5.8h, v2.h[2]
+        SMLAL v19.4s, v4.4h, v3.h[2]
+        SMLAL2 v23.4s, v4.8h, v3.h[2]
+        SMLAL v27.4s, v5.4h, v3.h[2]
+        SMLAL2 v31.4s, v5.8h, v3.h[2]
+        CMP x0, 4 // done if k < 4
+        B.LO 3b
+
+        LDP d4, d5, [x5], 16 // B for lane 3
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[3]
+        SMLAL2 v20.4s, v4.8h, v0.h[3]
+        SMLAL v24.4s, v5.4h, v0.h[3]
+        SMLAL2 v28.4s, v5.8h, v0.h[3]
+        SMLAL v17.4s, v4.4h, v1.h[3]
+        SMLAL2 v21.4s, v4.8h, v1.h[3]
+        SMLAL v25.4s, v5.4h, v1.h[3]
+        SMLAL2 v29.4s, v5.8h, v1.h[3]
+        SMLAL v18.4s, v4.4h, v2.h[3]
+        SMLAL2 v22.4s, v4.8h, v2.h[3]
+        SMLAL v26.4s, v5.4h, v2.h[3]
+        SMLAL2 v30.4s, v5.8h, v2.h[3]
+        SMLAL v19.4s, v4.4h, v3.h[3]
+        SMLAL2 v23.4s, v4.8h, v3.h[3]
+        SMLAL v27.4s, v5.4h, v3.h[3]
+        SMLAL2 v31.4s, v5.8h, v3.h[3]
+        B.EQ 3b // done if k == 4
+
+        LDP d4, d5, [x5], 16 // B for lane 4
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[4]
+        SMLAL2 v20.4s, v4.8h, v0.h[4]
+        SMLAL v24.4s, v5.4h, v0.h[4]
+        SMLAL2 v28.4s, v5.8h, v0.h[4]
+        SMLAL v17.4s, v4.4h, v1.h[4]
+        SMLAL2 v21.4s, v4.8h, v1.h[4]
+        SMLAL v25.4s, v5.4h, v1.h[4]
+        SMLAL2 v29.4s, v5.8h, v1.h[4]
+        SMLAL v18.4s, v4.4h, v2.h[4]
+        SMLAL2 v22.4s, v4.8h, v2.h[4]
+        SMLAL v26.4s, v5.4h, v2.h[4]
+        SMLAL2 v30.4s, v5.8h, v2.h[4]
+        SMLAL v19.4s, v4.4h, v3.h[4]
+        SMLAL2 v23.4s, v4.8h, v3.h[4]
+        SMLAL v27.4s, v5.4h, v3.h[4]
+        SMLAL2 v31.4s, v5.8h, v3.h[4]
+        CMP x0, 6 // done if k < 6
+        B.LO 3b
+
+        LDP d4, d5, [x5], 16 // B for lane 5
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[5]
+        SMLAL2 v20.4s, v4.8h, v0.h[5]
+        SMLAL v24.4s, v5.4h, v0.h[5]
+        SMLAL2 v28.4s, v5.8h, v0.h[5]
+        SMLAL v17.4s, v4.4h, v1.h[5]
+        SMLAL2 v21.4s, v4.8h, v1.h[5]
+        SMLAL v25.4s, v5.4h, v1.h[5]
+        SMLAL2 v29.4s, v5.8h, v1.h[5]
+        SMLAL v18.4s, v4.4h, v2.h[5]
+        SMLAL2 v22.4s, v4.8h, v2.h[5]
+        SMLAL v26.4s, v5.4h, v2.h[5]
+        SMLAL2 v30.4s, v5.8h, v2.h[5]
+        SMLAL v19.4s, v4.4h, v3.h[5]
+        SMLAL2 v23.4s, v4.8h, v3.h[5]
+        SMLAL v27.4s, v5.4h, v3.h[5]
+        SMLAL2 v31.4s, v5.8h, v3.h[5]
+        B.EQ 3b // done if k == 6
+
+        LDP d4, d5, [x5], 16 // B for lane 6
+        SXTL v4.8h, v4.8b
+        SXTL v5.8h, v5.8b
+        SMLAL v16.4s, v4.4h, v0.h[6]
+        SMLAL2 v20.4s, v4.8h, v0.h[6]
+        SMLAL v24.4s, v5.4h, v0.h[6]
+        SMLAL2 v28.4s, v5.8h, v0.h[6]
+        SMLAL v17.4s, v4.4h, v1.h[6]
+        SMLAL2 v21.4s, v4.8h, v1.h[6]
+        SMLAL v25.4s, v5.4h, v1.h[6]
+        SMLAL2 v29.4s, v5.8h, v1.h[6]
+        SMLAL v18.4s, v4.4h, v2.h[6]
+        SMLAL2 v22.4s, v4.8h, v2.h[6]
+        SMLAL v26.4s, v5.4h, v2.h[6]
+        SMLAL2 v30.4s, v5.8h, v2.h[6]
+        SMLAL v19.4s, v4.4h, v3.h[6]
+        SMLAL2 v23.4s, v4.8h, v3.h[6]
+        SMLAL v27.4s, v5.4h, v3.h[6]
+        SMLAL2 v31.4s, v5.8h, v3.h[6]
+        B 3b // k == 7 is the largest remainder
+
+        # Store odd width
+        .p2align 3
+5:
+        TBZ x1, 3, 6f // skip if (nc & 8) == 0
+        STR d3, [x7], 8
+        STR d2, [x17], 8
+        DUP d3, v3.d[1]
+        DUP d2, v2.d[1]
+        STR d1, [x16], 8
+        STR d0, [x6], 8
+        DUP d1, v1.d[1]
+        DUP d0, v0.d[1]
+6:
+        TBZ x1, 2, 7f // skip if (nc & 4) == 0
+        STR s3, [x7], 4
+        STR s2, [x17], 4
+        DUP s3, v3.s[1]
+        DUP s2, v2.s[1]
+        STR s1, [x16], 4
+        STR s0, [x6], 4
+        DUP s1, v1.s[1]
+        DUP s0, v0.s[1]
+7:
+        TBZ x1, 1, 8f // skip if (nc & 2) == 0
+        STR h3, [x7], 2
+        STR h2, [x17], 2
+        DUP h3, v3.h[1]
+        DUP h2, v2.h[1]
+        STR h1, [x16], 2
+        STR h0, [x6], 2
+        DUP h1, v1.h[1]
+        DUP h0, v0.h[1]
+8:
+        TBZ x1, 0, 9f // skip if (nc & 1) == 0
+        STR b3, [x7]
+        STR b2, [x17]
+        STR b1, [x16]
+        STR b0, [x6]
+9:
+        # Restore x20-x21 from stack
+        LDP x20, x21, [sp], 16
+        RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..201f541
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,615 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const int8_t** restrict a, x4
+# const int8_t* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x10
+# size_t a_offset, [sp + 8] -> x8
+# const int8_t* zero, [sp + 16] -> x12
+# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0
+# A1 x14 v1
+# A2 x15 v2
+# A3 x20 v3
+# B x5 v4 v5 v6
+# C0 x6 v16 v20 v24 v28
+# C1 x16 v17 v21 v25 v29
+# C2 x17 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+# x11, x21 temp for Cortex-A53 loads
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+ # Clamp C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x10, x8, [sp] // Load cn_stride, a_offset
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x16, x6, x16, LO // c1 = c0
+
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ LDP x12, x11, [sp, 16] // Load zero, params pointer
+ // if mr <= 2
+ CSEL x17, x16, x17, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+ ADD x7, x17, x7 // c3 = c2 + cm_stride
+ CSEL x7, x17, x7, LO // c3 = c2
+
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ MOV x9, x3 // p = ks
+
+ .p2align 3
+1:
+ # Load next 4 A pointers
+ LDP x13, x14, [x4], 16
+ LDP x15, x20, [x4], 16
+
+ CMP x13, x12 // if a0 == zero
+ ADD x13, x13, x8 // a0 += a_offset
+ CSEL x13, x12, x13, EQ // a0 = zero, else a0 += a_offset
+ CMP x14, x12 // if a1 == zero
+ ADD x14, x14, x8 // a1 += a_offset
+ CSEL x14, x12, x14, EQ // a1 = zero, else a1 += a_offset
+ CMP x15, x12 // if a2 == zero
+ ADD x15, x15, x8 // a2 += a_offset
+ CSEL x15, x12, x15, EQ // a2 = zero, else a2 += a_offset
+ CMP x20, x12 // if a3 == zero
+ ADD x20, x20, x8 // a3 += a_offset
+ CSEL x20, x12, x20, EQ // a3 = zero, else a3 += a_offset
+
+ # Is there at least 8 bytes for main loop?
+ SUBS x0, x2, 8 // k = kc - 8
+ B.LO 4f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+2:
+ LD1 {v0.8b}, [x13], 8
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x14], 8
+ LD1 {v2.8b}, [x15], 8
+ LD1 {v3.8b}, [x20], 8
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[7]
+ SMLAL2 v20.4s, v4.8h, v0.h[7]
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v17.4s, v4.4h, v1.h[7]
+ SMLAL2 v21.4s, v4.8h, v1.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v18.4s, v4.4h, v2.h[7]
+ SMLAL2 v22.4s, v4.8h, v2.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v19.4s, v4.4h, v3.h[7]
+ SMLAL2 v23.4s, v4.8h, v3.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 2b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+ # Is there a remainder? - 1 to 7 bytes of A
+ CBNZ x0, 4f
+
+3:
+ # ks loop
+ SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
+ B.HI 1b
+
+ # Apply params - preshift, scale, postshift, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SSHL v16.4s, v16.4s, v4.4s // shift to upper bits
+ SSHL v17.4s, v17.4s, v4.4s
+ SSHL v18.4s, v18.4s, v4.4s
+ SSHL v19.4s, v19.4s, v4.4s
+ SSHL v20.4s, v20.4s, v4.4s
+ SSHL v21.4s, v21.4s, v4.4s
+ SSHL v22.4s, v22.4s, v4.4s
+ SSHL v23.4s, v23.4s, v4.4s
+ LD1R {v5.4s}, [x11], 4
+ SSHL v24.4s, v24.4s, v4.4s
+ SSHL v25.4s, v25.4s, v4.4s
+ SSHL v26.4s, v26.4s, v4.4s
+ SSHL v27.4s, v27.4s, v4.4s
+ SSHL v28.4s, v28.4s, v4.4s
+ SSHL v29.4s, v29.4s, v4.4s
+ SSHL v30.4s, v30.4s, v4.4s
+ SSHL v31.4s, v31.4s, v4.4s
+ LD1R {v6.4s}, [x11], 4
+ SQDMULH v16.4s, v16.4s, v5.4s // scale without rounding
+ SQDMULH v17.4s, v17.4s, v5.4s
+ SQDMULH v18.4s, v18.4s, v5.4s
+ SQDMULH v19.4s, v19.4s, v5.4s
+ SQDMULH v20.4s, v20.4s, v5.4s
+ SQDMULH v21.4s, v21.4s, v5.4s
+ SQDMULH v22.4s, v22.4s, v5.4s
+ SQDMULH v23.4s, v23.4s, v5.4s
+ SQDMULH v24.4s, v24.4s, v5.4s
+ SQDMULH v25.4s, v25.4s, v5.4s
+ SQDMULH v26.4s, v26.4s, v5.4s
+ SQDMULH v27.4s, v27.4s, v5.4s
+ SQDMULH v28.4s, v28.4s, v5.4s
+ SQDMULH v29.4s, v29.4s, v5.4s
+ SQDMULH v30.4s, v30.4s, v5.4s
+ SQDMULH v31.4s, v31.4s, v5.4s
+ SRSHL v16.4s, v16.4s, v6.4s // signed rounding shift left
+ SRSHL v17.4s, v17.4s, v6.4s
+ SRSHL v18.4s, v18.4s, v6.4s
+ SRSHL v19.4s, v19.4s, v6.4s
+ SRSHL v20.4s, v20.4s, v6.4s
+ SRSHL v21.4s, v21.4s, v6.4s
+ SRSHL v22.4s, v22.4s, v6.4s
+ SRSHL v23.4s, v23.4s, v6.4s
+ SRSHL v24.4s, v24.4s, v6.4s
+ SRSHL v25.4s, v25.4s, v6.4s
+ SRSHL v26.4s, v26.4s, v6.4s
+ SRSHL v27.4s, v27.4s, v6.4s
+ SRSHL v28.4s, v28.4s, v6.4s
+ SRSHL v29.4s, v29.4s, v6.4s
+ SRSHL v30.4s, v30.4s, v6.4s
+ SRSHL v31.4s, v31.4s, v6.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v6.8h
+ SQADD v17.8h, v17.8h, v6.8h
+ SQADD v18.8h, v18.8h, v6.8h
+ SQADD v19.8h, v19.8h, v6.8h
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v4.16b}, [x11], 1 // clamp min value
+
+ SQXTN v0.8b, v16.8h
+ SQXTN v1.8b, v17.8h
+ SQXTN v2.8b, v18.8h
+ SQXTN v3.8b, v19.8h
+ LD1R {v5.16b}, [x11] // clamp max value
+ SQXTN2 v0.16b, v24.8h
+ SQXTN2 v1.16b, v25.8h
+ SQXTN2 v2.16b, v26.8h
+ SQXTN2 v3.16b, v27.8h
+ SUB x11, x11, 15 // rewind params pointer
+
+ SMAX v0.16b, v0.16b, v4.16b
+ SMAX v1.16b, v1.16b, v4.16b
+ SMAX v2.16b, v2.16b, v4.16b
+ SMAX v3.16b, v3.16b, v4.16b
+ SUBS x1, x1, 16
+ SMIN v0.16b, v0.16b, v5.16b
+ SMIN v1.16b, v1.16b, v5.16b
+ SMIN v2.16b, v2.16b, v5.16b
+ SMIN v3.16b, v3.16b, v5.16b
+ B.LO 5f
+
+ # Store full 4 x 16
+ ST1 {v3.16b}, [x7], x10
+ ST1 {v2.16b}, [x17], x10
+ ST1 {v1.16b}, [x16], x10
+ ST1 {v0.16b}, [x6], x10
+
+ SUB x4, x4, x3 // a -= ks
+
+ # nc loop
+ B.HI 0b
+
+ # Restore x20-x21 from stack
+ LDP x20, x21, [sp], 16
+ RET
+
+ # Remainder - 1 to 7 bytes of A
+ .p2align 3
+4:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x13], x0
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x14], x0
+ LD1 {v2.8b}, [x15], x0
+ LD1 {v3.8b}, [x20], x0
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 3b
+
+ # Store odd width
+ .p2align 3
+5:
+ TBZ x1, 3, 6f
+ STR d3, [x7], 8
+ STR d2, [x17], 8
+ DUP d3, v3.d[1]
+ DUP d2, v2.d[1]
+ STR d1, [x16], 8
+ STR d0, [x6], 8
+ DUP d1, v1.d[1]
+ DUP d0, v0.d[1]
+6:
+ TBZ x1, 2, 7f
+ STR s3, [x7], 4
+ STR s2, [x17], 4
+ DUP s3, v3.s[1]
+ DUP s2, v2.s[1]
+ STR s1, [x16], 4
+ STR s0, [x6], 4
+ DUP s1, v1.s[1]
+ DUP s0, v0.s[1]
+7:
+ TBZ x1, 1, 8f
+ STR h3, [x7], 2
+ STR h2, [x17], 2
+ DUP h3, v3.h[1]
+ DUP h2, v2.h[1]
+ STR h1, [x16], 2
+ STR h0, [x6], 2
+ DUP h1, v1.h[1]
+ DUP h0, v0.h[1]
+8:
+ TBZ x1, 0, 9f
+ STR b3, [x7]
+ STR b2, [x17]
+ STR b1, [x16]
+ STR b0, [x6]
+9:
+ # Restore x20-x21 from stack
+ LDP x20, x21, [sp], 16
+ RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..a9eabe1
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,621 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const int8_t** restrict a, x4
+# const int8_t* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x10
+# size_t a_offset, [sp + 8] -> x8
+# const int8_t* zero, [sp + 16] -> x12
+# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0
+# A1 x14 v1
+# A2 x15 v2
+# A3 x20 v3
+# B x5 v4 v5 v6
+# C0 x6 v16 v20 v24 v28
+# C1 x16 v17 v21 v25 v29
+# C2 x17 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
+# x11, x21 temp for Cortex-A53 loads
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+ # Clamp C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x10, x8, [sp] // Load cn_stride, a_offset
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x16, x6, x16, LO // c1 = c0
+
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ LDP x12, x11, [sp, 16] // Load zero, params pointer
+ // if mr <= 2
+ CSEL x17, x16, x17, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+ ADD x7, x17, x7 // c3 = c2 + cm_stride
+ CSEL x7, x17, x7, LO // c3 = c2
+
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ MOV x9, x3 // p = ks
+
+ .p2align 3
+1:
+ # Load next 4 A pointers
+ LDP x13, x14, [x4], 16
+ LDP x15, x20, [x4], 16
+
+ CMP x13, x12 // if a0 == zero
+ ADD x13, x13, x8 // a0 += a_offset
+ CSEL x13, x12, x13, EQ // a0 = zero, else a0 += a_offset
+ CMP x14, x12 // if a1 == zero
+ ADD x14, x14, x8 // a1 += a_offset
+ CSEL x14, x12, x14, EQ // a1 = zero, else a1 += a_offset
+ CMP x15, x12 // if a2 == zero
+ ADD x15, x15, x8 // a2 += a_offset
+ CSEL x15, x12, x15, EQ // a2 = zero, else a2 += a_offset
+ CMP x20, x12 // if a3 == zero
+ ADD x20, x20, x8 // a3 += a_offset
+ CSEL x20, x12, x20, EQ // a3 = zero, else a3 += a_offset
+
+ # Is there at least 8 bytes for main loop?
+ SUBS x0, x2, 8 // k = kc - 8
+ B.LO 4f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+2:
+ LD1 {v0.8b}, [x13], 8
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x14], 8
+ LD1 {v2.8b}, [x15], 8
+ LD1 {v3.8b}, [x20], 8
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ PRFM PLDL1KEEP, [x13, 128]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ PRFM PLDL1KEEP, [x14, 128]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ PRFM PLDL1KEEP, [x15, 128]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ PRFM PLDL1KEEP, [x20, 128]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ PRFM PLDL1KEEP, [x5, 448]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ PRFM PLDL1KEEP, [x5, 512]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[7]
+ SMLAL2 v20.4s, v4.8h, v0.h[7]
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v17.4s, v4.4h, v1.h[7]
+ SMLAL2 v21.4s, v4.8h, v1.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v18.4s, v4.4h, v2.h[7]
+ SMLAL2 v22.4s, v4.8h, v2.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v19.4s, v4.4h, v3.h[7]
+ SMLAL2 v23.4s, v4.8h, v3.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 2b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+ # Is there a remainder? - 1 to 7 bytes of A
+ CBNZ x0, 4f
+
+3:
+ # ks loop
+ SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
+ B.HI 1b
+
+ # Apply params - preshift, scale, postshift, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SSHL v16.4s, v16.4s, v4.4s // shift to upper bits
+ SSHL v17.4s, v17.4s, v4.4s
+ SSHL v18.4s, v18.4s, v4.4s
+ SSHL v19.4s, v19.4s, v4.4s
+ SSHL v20.4s, v20.4s, v4.4s
+ SSHL v21.4s, v21.4s, v4.4s
+ SSHL v22.4s, v22.4s, v4.4s
+ SSHL v23.4s, v23.4s, v4.4s
+ LD1R {v5.4s}, [x11], 4
+ SSHL v24.4s, v24.4s, v4.4s
+ SSHL v25.4s, v25.4s, v4.4s
+ SSHL v26.4s, v26.4s, v4.4s
+ SSHL v27.4s, v27.4s, v4.4s
+ SSHL v28.4s, v28.4s, v4.4s
+ SSHL v29.4s, v29.4s, v4.4s
+ SSHL v30.4s, v30.4s, v4.4s
+ SSHL v31.4s, v31.4s, v4.4s
+ LD1R {v6.4s}, [x11], 4
+ SQDMULH v16.4s, v16.4s, v5.4s // scale without rounding
+ SQDMULH v17.4s, v17.4s, v5.4s
+ SQDMULH v18.4s, v18.4s, v5.4s
+ SQDMULH v19.4s, v19.4s, v5.4s
+ SQDMULH v20.4s, v20.4s, v5.4s
+ SQDMULH v21.4s, v21.4s, v5.4s
+ SQDMULH v22.4s, v22.4s, v5.4s
+ SQDMULH v23.4s, v23.4s, v5.4s
+ SQDMULH v24.4s, v24.4s, v5.4s
+ SQDMULH v25.4s, v25.4s, v5.4s
+ SQDMULH v26.4s, v26.4s, v5.4s
+ SQDMULH v27.4s, v27.4s, v5.4s
+ SQDMULH v28.4s, v28.4s, v5.4s
+ SQDMULH v29.4s, v29.4s, v5.4s
+ SQDMULH v30.4s, v30.4s, v5.4s
+ SQDMULH v31.4s, v31.4s, v5.4s
+ SRSHL v16.4s, v16.4s, v6.4s // signed rounding shift left
+ SRSHL v17.4s, v17.4s, v6.4s
+ SRSHL v18.4s, v18.4s, v6.4s
+ SRSHL v19.4s, v19.4s, v6.4s
+ SRSHL v20.4s, v20.4s, v6.4s
+ SRSHL v21.4s, v21.4s, v6.4s
+ SRSHL v22.4s, v22.4s, v6.4s
+ SRSHL v23.4s, v23.4s, v6.4s
+ SRSHL v24.4s, v24.4s, v6.4s
+ SRSHL v25.4s, v25.4s, v6.4s
+ SRSHL v26.4s, v26.4s, v6.4s
+ SRSHL v27.4s, v27.4s, v6.4s
+ SRSHL v28.4s, v28.4s, v6.4s
+ SRSHL v29.4s, v29.4s, v6.4s
+ SRSHL v30.4s, v30.4s, v6.4s
+ SRSHL v31.4s, v31.4s, v6.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v6.8h
+ SQADD v17.8h, v17.8h, v6.8h
+ SQADD v18.8h, v18.8h, v6.8h
+ SQADD v19.8h, v19.8h, v6.8h
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v4.16b}, [x11], 1 // clamp min value
+
+ SQXTN v0.8b, v16.8h
+ SQXTN v1.8b, v17.8h
+ SQXTN v2.8b, v18.8h
+ SQXTN v3.8b, v19.8h
+ LD1R {v5.16b}, [x11] // clamp max value
+ SQXTN2 v0.16b, v24.8h
+ SQXTN2 v1.16b, v25.8h
+ SQXTN2 v2.16b, v26.8h
+ SQXTN2 v3.16b, v27.8h
+ SUB x11, x11, 15 // rewind params pointer
+
+ SMAX v0.16b, v0.16b, v4.16b
+ SMAX v1.16b, v1.16b, v4.16b
+ SMAX v2.16b, v2.16b, v4.16b
+ SMAX v3.16b, v3.16b, v4.16b
+ SUBS x1, x1, 16
+ SMIN v0.16b, v0.16b, v5.16b
+ SMIN v1.16b, v1.16b, v5.16b
+ SMIN v2.16b, v2.16b, v5.16b
+ SMIN v3.16b, v3.16b, v5.16b
+ B.LO 5f
+
+ # Store full 4 x 16
+ ST1 {v3.16b}, [x7], x10
+ ST1 {v2.16b}, [x17], x10
+ ST1 {v1.16b}, [x16], x10
+ ST1 {v0.16b}, [x6], x10
+
+ SUB x4, x4, x3 // a -= ks
+
+ # nc loop
+ B.HI 0b
+
+ # Restore x20-x21 from stack
+ LDP x20, x21, [sp], 16
+ RET
+
+ # Remainder - 1 to 7 bytes of A
+ .p2align 3
+4:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x13], x0
+ LDP d4, d5, [x5], 16
+ LD1 {v1.8b}, [x14], x0
+ LD1 {v2.8b}, [x15], x0
+ LD1 {v3.8b}, [x20], x0
+ SXTL v0.8h, v0.8b
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v16.4s, v4.4h, v0.h[0]
+ SMLAL2 v20.4s, v4.8h, v0.h[0]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v17.4s, v4.4h, v1.h[0]
+ SMLAL2 v21.4s, v4.8h, v1.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v18.4s, v4.4h, v2.h[0]
+ SMLAL2 v22.4s, v4.8h, v2.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v19.4s, v4.4h, v3.h[0]
+ SMLAL2 v23.4s, v4.8h, v3.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[1]
+ SMLAL2 v20.4s, v4.8h, v0.h[1]
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v17.4s, v4.4h, v1.h[1]
+ SMLAL2 v21.4s, v4.8h, v1.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v18.4s, v4.4h, v2.h[1]
+ SMLAL2 v22.4s, v4.8h, v2.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v19.4s, v4.4h, v3.h[1]
+ SMLAL2 v23.4s, v4.8h, v3.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[2]
+ SMLAL2 v20.4s, v4.8h, v0.h[2]
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v17.4s, v4.4h, v1.h[2]
+ SMLAL2 v21.4s, v4.8h, v1.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v18.4s, v4.4h, v2.h[2]
+ SMLAL2 v22.4s, v4.8h, v2.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v19.4s, v4.4h, v3.h[2]
+ SMLAL2 v23.4s, v4.8h, v3.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[3]
+ SMLAL2 v20.4s, v4.8h, v0.h[3]
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v17.4s, v4.4h, v1.h[3]
+ SMLAL2 v21.4s, v4.8h, v1.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v18.4s, v4.4h, v2.h[3]
+ SMLAL2 v22.4s, v4.8h, v2.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v19.4s, v4.4h, v3.h[3]
+ SMLAL2 v23.4s, v4.8h, v3.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[4]
+ SMLAL2 v20.4s, v4.8h, v0.h[4]
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v17.4s, v4.4h, v1.h[4]
+ SMLAL2 v21.4s, v4.8h, v1.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v18.4s, v4.4h, v2.h[4]
+ SMLAL2 v22.4s, v4.8h, v2.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v19.4s, v4.4h, v3.h[4]
+ SMLAL2 v23.4s, v4.8h, v3.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[5]
+ SMLAL2 v20.4s, v4.8h, v0.h[5]
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v17.4s, v4.4h, v1.h[5]
+ SMLAL2 v21.4s, v4.8h, v1.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v18.4s, v4.4h, v2.h[5]
+ SMLAL2 v22.4s, v4.8h, v2.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v19.4s, v4.4h, v3.h[5]
+ SMLAL2 v23.4s, v4.8h, v3.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 3b
+
+ LDP d4, d5, [x5], 16
+ SXTL v4.8h, v4.8b
+ SXTL v5.8h, v5.8b
+ SMLAL v16.4s, v4.4h, v0.h[6]
+ SMLAL2 v20.4s, v4.8h, v0.h[6]
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v17.4s, v4.4h, v1.h[6]
+ SMLAL2 v21.4s, v4.8h, v1.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v18.4s, v4.4h, v2.h[6]
+ SMLAL2 v22.4s, v4.8h, v2.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v19.4s, v4.4h, v3.h[6]
+ SMLAL2 v23.4s, v4.8h, v3.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 3b
+
+ # Store odd width
+ .p2align 3
+5:
+ TBZ x1, 3, 6f
+ STR d3, [x7], 8
+ STR d2, [x17], 8
+ DUP d3, v3.d[1]
+ DUP d2, v2.d[1]
+ STR d1, [x16], 8
+ STR d0, [x6], 8
+ DUP d1, v1.d[1]
+ DUP d0, v0.d[1]
+6:
+ TBZ x1, 2, 7f
+ STR s3, [x7], 4
+ STR s2, [x17], 4
+ DUP s3, v3.s[1]
+ DUP s2, v2.s[1]
+ STR s1, [x16], 4
+ STR s0, [x6], 4
+ DUP s1, v1.s[1]
+ DUP s0, v0.s[1]
+7:
+ TBZ x1, 1, 8f
+ STR h3, [x7], 2
+ STR h2, [x17], 2
+ DUP h3, v3.h[1]
+ DUP h2, v2.h[1]
+ STR h1, [x16], 2
+ STR h0, [x6], 2
+ DUP h1, v1.h[1]
+ DUP h0, v0.h[1]
+8:
+ TBZ x1, 0, 9f
+ STR b3, [x7]
+ STR b2, [x17]
+ STR b1, [x16]
+ STR b0, [x6]
+9:
+ # Restore x20-x21 from stack
+ LDP x20, x21, [sp], 16
+ RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 236898f..81c4ed4 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -1029,15 +1029,21 @@
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
-
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
+
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
+
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32)
@@ -1390,6 +1396,9 @@
DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
+DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
+
DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32)
DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64)
DECLARE_QC8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32)
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 1c47ddc..935ec5f 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -851,15 +851,21 @@
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__aarch64_neon_mlal_prfm_cortex_a53)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c16__aarch64_neon_mlal)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
-DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
-
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
+
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55)
@@ -1146,6 +1152,9 @@
DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
+DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
+
DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64)
DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128)
DECLARE_QC8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55)
diff --git a/test/qc8-gemm-minmax-fp32.cc b/test/qc8-gemm-minmax-fp32.cc
index f36b52b..4413640 100644
--- a/test/qc8-gemm-minmax-fp32.cc
+++ b/test/qc8-gemm-minmax-fp32.cc
@@ -21911,6 +21911,918 @@
#if XNN_ARCH_ARM64
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QC8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
TEST(QC8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
TEST_REQUIRES_ARM_NEON_DOT;
GemmMicrokernelTester()
diff --git a/test/qc8-gemm-minmax-fp32.yaml b/test/qc8-gemm-minmax-fp32.yaml
index c7637ca..b2ca2a3 100644
--- a/test/qc8-gemm-minmax-fp32.yaml
+++ b/test/qc8-gemm-minmax-fp32.yaml
@@ -147,6 +147,12 @@
- name: xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
init: xnn_init_qs8_minmax_neon_params
k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+ init: xnn_init_qs8_minmax_neon_params
+ k-block: 8
+- name: xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+ init: xnn_init_qs8_minmax_neon_params
+ k-block: 8
- name: xnn_qc8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32
init: xnn_init_qs8_minmax_neon_params
k-block: 4
diff --git a/test/qc8-igemm-minmax-fp32.cc b/test/qc8-igemm-minmax-fp32.cc
index 035e068..850afa7 100644
--- a/test/qc8-igemm-minmax-fp32.cc
+++ b/test/qc8-igemm-minmax-fp32.cc
@@ -21083,7 +21083,7 @@
#if XNN_ARCH_ARM64
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
.mr(4)
@@ -21093,10 +21093,10 @@
.m(4)
.n(16)
.k(8)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cn) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
.mr(4)
@@ -21107,10 +21107,10 @@
.n(16)
.k(8)
.cn_stride(19)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t m = 1; m <= 4; m++) {
for (uint32_t n = 1; n <= 16; n++) {
@@ -21123,12 +21123,12 @@
.n(n)
.k(8)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_m) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t m = 1; m <= 4; m++) {
GemmMicrokernelTester()
@@ -21140,11 +21140,11 @@
.n(16)
.k(8)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8_subtile_n) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 1; n <= 16; n++) {
GemmMicrokernelTester()
@@ -21156,11 +21156,11 @@
.n(n)
.k(8)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 1; k < 8; k++) {
GemmMicrokernelTester()
@@ -21171,11 +21171,11 @@
.m(4)
.n(16)
.k(k)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_lt_8_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 1; k < 8; k++) {
for (uint32_t m = 1; m <= 4; m++) {
@@ -21189,13 +21189,13 @@
.n(n)
.k(k)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 9; k < 16; k++) {
GemmMicrokernelTester()
@@ -21206,11 +21206,11 @@
.m(4)
.n(16)
.k(k)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_gt_8_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 9; k < 16; k++) {
for (uint32_t m = 1; m <= 4; m++) {
@@ -21224,13 +21224,13 @@
.n(n)
.k(k)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 16; k <= 80; k += 8) {
GemmMicrokernelTester()
@@ -21241,11 +21241,11 @@
.m(4)
.n(16)
.k(k)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_div_8_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 16; k <= 80; k += 8) {
for (uint32_t m = 1; m <= 4; m++) {
@@ -21259,13 +21259,13 @@
.n(n)
.k(k)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 17; n < 32; n++) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21277,12 +21277,12 @@
.m(4)
.n(16)
.k(k)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_strided_cn) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 17; n < 32; n++) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21295,12 +21295,12 @@
.n(16)
.k(k)
.cn_stride(19)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 17; n < 32; n++) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21314,13 +21314,13 @@
.n(n)
.k(k)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 32; n <= 48; n += 16) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21332,12 +21332,12 @@
.m(4)
.n(16)
.k(k)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_strided_cn) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 32; n <= 48; n += 16) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21350,12 +21350,12 @@
.n(n)
.k(k)
.cn_stride(19)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 32; n <= 48; n += 16) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21369,13 +21369,13 @@
.n(n)
.k(k)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, small_kernel) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 1; k <= 40; k += 9) {
GemmMicrokernelTester()
@@ -21387,11 +21387,11 @@
.n(16)
.k(k)
.ks(3)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, small_kernel_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel_subtile) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 1; k <= 40; k += 9) {
for (uint32_t m = 1; m <= 4; m++) {
@@ -21406,13 +21406,13 @@
.k(k)
.ks(3)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_gt_16_small_kernel) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_small_kernel) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 17; n < 32; n++) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21425,12 +21425,12 @@
.n(16)
.k(k)
.ks(3)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, n_div_16_small_kernel) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_small_kernel) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 32; n <= 48; n += 16) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21443,12 +21443,12 @@
.n(16)
.k(k)
.ks(3)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 1; k <= 40; k += 9) {
for (uint32_t m = 1; m <= 4; m++) {
@@ -21463,13 +21463,13 @@
.k(k)
.cm_stride(19)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, a_offset) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, a_offset) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 1; k <= 40; k += 9) {
GemmMicrokernelTester()
@@ -21482,11 +21482,11 @@
.k(k)
.ks(3)
.a_offset(163)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, zero) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, zero) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t mz = 0; mz < 4; mz++) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21501,12 +21501,12 @@
.ks(3)
.a_offset(163)
.zero_index(mz)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmin) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
.mr(4)
@@ -21517,10 +21517,10 @@
.n(16)
.k(8)
.qmin(128)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, qmax) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
.mr(4)
@@ -21531,10 +21531,10 @@
.n(16)
.k(8)
.qmax(128)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, strided_cm) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
.mr(4)
@@ -21545,13 +21545,13 @@
.n(16)
.k(8)
.cm_stride(19)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
#endif // XNN_ARCH_ARM64
#if XNN_ARCH_ARM64
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_eq_8) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
.mr(4)
@@ -21561,10 +21561,10 @@
.m(4)
.n(16)
.k(8)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, strided_cn) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
.mr(4)
@@ -21575,10 +21575,10 @@
.n(16)
.k(8)
.cn_stride(19)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t m = 1; m <= 4; m++) {
for (uint32_t n = 1; n <= 16; n++) {
@@ -21591,12 +21591,12 @@
.n(n)
.k(8)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile_m) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t m = 1; m <= 4; m++) {
GemmMicrokernelTester()
@@ -21608,11 +21608,11 @@
.n(16)
.k(8)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_eq_8_subtile_n) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 1; n <= 16; n++) {
GemmMicrokernelTester()
@@ -21624,11 +21624,11 @@
.n(n)
.k(8)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_lt_8) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 1; k < 8; k++) {
GemmMicrokernelTester()
@@ -21639,11 +21639,11 @@
.m(4)
.n(16)
.k(k)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_lt_8_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 1; k < 8; k++) {
for (uint32_t m = 1; m <= 4; m++) {
@@ -21657,13 +21657,13 @@
.n(n)
.k(k)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_gt_8) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 9; k < 16; k++) {
GemmMicrokernelTester()
@@ -21674,11 +21674,11 @@
.m(4)
.n(16)
.k(k)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_gt_8_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 9; k < 16; k++) {
for (uint32_t m = 1; m <= 4; m++) {
@@ -21692,13 +21692,13 @@
.n(n)
.k(k)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_div_8) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 16; k <= 80; k += 8) {
GemmMicrokernelTester()
@@ -21709,11 +21709,11 @@
.m(4)
.n(16)
.k(k)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, k_div_8_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 16; k <= 80; k += 8) {
for (uint32_t m = 1; m <= 4; m++) {
@@ -21727,13 +21727,13 @@
.n(n)
.k(k)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_gt_16) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 17; n < 32; n++) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21745,12 +21745,12 @@
.m(4)
.n(16)
.k(k)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_gt_16_strided_cn) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 17; n < 32; n++) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21763,12 +21763,12 @@
.n(16)
.k(k)
.cn_stride(19)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_gt_16_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 17; n < 32; n++) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21782,13 +21782,13 @@
.n(n)
.k(k)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_div_16) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 32; n <= 48; n += 16) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21800,12 +21800,12 @@
.m(4)
.n(16)
.k(k)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_div_16_strided_cn) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 32; n <= 48; n += 16) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21818,12 +21818,12 @@
.n(n)
.k(k)
.cn_stride(19)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_div_16_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 32; n <= 48; n += 16) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21837,13 +21837,13 @@
.n(n)
.k(k)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, small_kernel) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 1; k <= 40; k += 9) {
GemmMicrokernelTester()
@@ -21855,11 +21855,11 @@
.n(16)
.k(k)
.ks(3)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, small_kernel_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel_subtile) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 1; k <= 40; k += 9) {
for (uint32_t m = 1; m <= 4; m++) {
@@ -21874,13 +21874,13 @@
.k(k)
.ks(3)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_gt_16_small_kernel) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_small_kernel) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 17; n < 32; n++) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21893,12 +21893,12 @@
.n(16)
.k(k)
.ks(3)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, n_div_16_small_kernel) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_small_kernel) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t n = 32; n <= 48; n += 16) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21911,12 +21911,12 @@
.n(16)
.k(k)
.ks(3)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, strided_cm_subtile) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 1; k <= 40; k += 9) {
for (uint32_t m = 1; m <= 4; m++) {
@@ -21931,13 +21931,13 @@
.k(k)
.cm_stride(19)
.iterations(1)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, a_offset) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, a_offset) {
TEST_REQUIRES_ARM_NEON;
for (size_t k = 1; k <= 40; k += 9) {
GemmMicrokernelTester()
@@ -21950,11 +21950,11 @@
.k(k)
.ks(3)
.a_offset(163)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, zero) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, zero) {
TEST_REQUIRES_ARM_NEON;
for (uint32_t mz = 0; mz < 4; mz++) {
for (size_t k = 1; k <= 40; k += 9) {
@@ -21969,12 +21969,12 @@
.ks(3)
.a_offset(163)
.zero_index(mz)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
}
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, qmin) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
.mr(4)
@@ -21985,10 +21985,10 @@
.n(16)
.k(8)
.qmin(128)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, qmax) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
.mr(4)
@@ -21999,10 +21999,10 @@
.n(16)
.k(8)
.qmax(128)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
- TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_CORTEX_A53, strided_cm) {
+ TEST(QC8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
.mr(4)
@@ -22013,7 +22013,7 @@
.n(16)
.k(8)
.cm_stride(19)
- .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ .Test(xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_minmax_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
}
#endif // XNN_ARCH_ARM64
diff --git a/test/qc8-igemm-minmax-fp32.yaml b/test/qc8-igemm-minmax-fp32.yaml
index 97a9aa7..ce31853 100644
--- a/test/qc8-igemm-minmax-fp32.yaml
+++ b/test/qc8-igemm-minmax-fp32.yaml
@@ -138,10 +138,10 @@
- name: xnn_qc8_igemm_minmax_fp32_ukernel_1x8c8__aarch64_neon_mlal_prfm_cortex_a53
init: xnn_init_qs8_minmax_neon_params
k-block: 16
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
init: xnn_init_qs8_minmax_neon_params
k-block: 8
-- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
+- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
init: xnn_init_qs8_minmax_neon_params
k-block: 8
- name: xnn_qc8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
diff --git a/test/qs8-gemm-minmax-fp32.cc b/test/qs8-gemm-minmax-fp32.cc
index 7447cf5..0983a12 100644
--- a/test/qs8-gemm-minmax-fp32.cc
+++ b/test/qs8-gemm-minmax-fp32.cc
@@ -21911,6 +21911,918 @@
#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
TEST_REQUIRES_ARM_NEON_DOT;
GemmMicrokernelTester()
diff --git a/test/qs8-gemm-minmax-fp32.yaml b/test/qs8-gemm-minmax-fp32.yaml
index 2e33bed..32df48e 100644
--- a/test/qs8-gemm-minmax-fp32.yaml
+++ b/test/qs8-gemm-minmax-fp32.yaml
@@ -147,6 +147,12 @@
- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
k-block: 8
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 8
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 8
- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32
init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
k-block: 4
diff --git a/test/qs8-gemm-minmax-rndnu.cc b/test/qs8-gemm-minmax-rndnu.cc
index 7fc86d4..f7fcb6b 100644
--- a/test/qs8-gemm-minmax-rndnu.cc
+++ b/test/qs8-gemm-minmax-rndnu.cc
@@ -71159,6 +71159,918 @@
#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
TEST(QS8_GEMM_MINMAX_RNDNU_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
TEST_REQUIRES_ARM_NEON_DOT;
GemmMicrokernelTester()
diff --git a/test/qs8-gemm-minmax-rndnu.yaml b/test/qs8-gemm-minmax-rndnu.yaml
index e737786..f514217 100644
--- a/test/qs8-gemm-minmax-rndnu.yaml
+++ b/test/qs8-gemm-minmax-rndnu.yaml
@@ -471,6 +471,12 @@
- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
init: xnn_init_qs8_conv_minmax_rndnu_neon_params
k-block: 8
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+ init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+ k-block: 8
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+ init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+ k-block: 8
- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x16c4__aarch64_neondot_ld32
init: xnn_init_qs8_conv_minmax_rndnu_neon_params
k-block: 4
diff --git a/test/qs8-igemm-minmax-fp32.cc b/test/qs8-igemm-minmax-fp32.cc
index 7e73ff8..09d3bde 100644
--- a/test/qs8-igemm-minmax-fp32.cc
+++ b/test/qs8-igemm-minmax-fp32.cc
@@ -22019,6 +22019,942 @@
#if XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
TEST_REQUIRES_ARM_NEON_DOT;
GemmMicrokernelTester()
diff --git a/test/qs8-igemm-minmax-fp32.yaml b/test/qs8-igemm-minmax-fp32.yaml
index 8403032..c5cd1a0 100644
--- a/test/qs8-igemm-minmax-fp32.yaml
+++ b/test/qs8-igemm-minmax-fp32.yaml
@@ -144,6 +144,12 @@
- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
k-block: 8
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 8
- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
k-block: 16
diff --git a/test/qs8-igemm-minmax-rndnu.cc b/test/qs8-igemm-minmax-rndnu.cc
index 7a10c55..5738241 100644
--- a/test/qs8-igemm-minmax-rndnu.cc
+++ b/test/qs8-igemm-minmax-rndnu.cc
@@ -72563,6 +72563,942 @@
#if XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X16__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
TEST(QS8_IGEMM_MINMAX_RNDNU_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
TEST_REQUIRES_ARM_NEON_DOT;
GemmMicrokernelTester()
diff --git a/test/qs8-igemm-minmax-rndnu.yaml b/test/qs8-igemm-minmax-rndnu.yaml
index 5d426f9..10eaa39 100644
--- a/test/qs8-igemm-minmax-rndnu.yaml
+++ b/test/qs8-igemm-minmax-rndnu.yaml
@@ -468,6 +468,12 @@
- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53
init: xnn_init_qs8_conv_minmax_rndnu_neon_params
k-block: 8
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
+ init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
+ init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+ k-block: 8
- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55
init: xnn_init_qs8_conv_minmax_rndnu_neon_params
k-block: 16