AArch64 4x8 lane ld64 GEMM/IGEMM microkernels.
- Based on the 4x16 microkernels, reduced to a 4x8 tile (a scalar reference sketch of the computation follows below).
- Update register usage comments for the 4x16 ld64 GEMM/IGEMM microkernels and drop the unused x21 save/restore from the 4x16 IGEMM variants.
PiperOrigin-RevId: 416107685
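
For context, the new kernels compute a 4x8 tile of C: four rows of A are streamed 8 bytes at a time, each int8 value is widened to int16 (SXTL/UXTL), and SMLAL/SMLAL2 accumulate against an 8-wide panel of packed weights into the int32 accumulators v24-v31, which are then requantized. The scalar sketch below mirrors only the accumulation loop; the helper name is hypothetical, the packed-weight layout (8 int32 biases followed by 8 int8 weights per k step) is read off the assembly in this change, and mr/kc remainder handling plus requantization are omitted.

  #include <stddef.h>
  #include <stdint.h>

  // Hypothetical scalar reference for the 4x8 lane microkernels in this change.
  static void reference_qs8_gemm_4x8(
      size_t kc,            // number of K elements (bytes of A per row)
      const int8_t* a,      // 4 rows of A, each a_stride bytes apart
      size_t a_stride,
      const void* w,        // packed weights: 8 x int32 bias, then 8 int8 per k
      int32_t acc[4][8])    // int32 accumulators; the real kernel requantizes these
  {
    const int32_t* bias = (const int32_t*) w;
    const int8_t* b = (const int8_t*) (bias + 8);
    for (size_t m = 0; m < 4; m++) {
      for (size_t n = 0; n < 8; n++) {
        acc[m][n] = bias[n];                              // LDP q24, q28 loads the bias row
      }
    }
    for (size_t k = 0; k < kc; k++) {
      for (size_t m = 0; m < 4; m++) {
        const int16_t va = (int16_t) a[m * a_stride + k]; // SXTL of A
        for (size_t n = 0; n < 8; n++) {
          const int16_t vb = (int16_t) b[k * 8 + n];      // SXTL of B
          acc[m][n] += (int32_t) va * (int32_t) vb;       // SMLAL/SMLAL2 by lane
        }
      }
    }
  }
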
diff --git a/BUILD.bazel b/BUILD.bazel
index 12ba1f2..fc36bbc 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4282,9 +4282,9 @@
"src/qc8-igemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
"src/qs8-dwconv/gen/up8x9-minmax-fp32-sse2-mul16-add16.c",
"src/qs8-dwconv/gen/up8x25-minmax-fp32-sse2-mul16-add16.c",
+ "src/qs8-f32-vcvt/gen/vcvt-sse2-x32.c",
"src/qs8-gavgpool/gen/7p7x-minmax-sse2-c8-acc2.c",
"src/qs8-gavgpool/gen/7x-minmax-sse2-c8-acc2.c",
- "src/qs8-f32-vcvt/gen/vcvt-sse2-x32.c",
"src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
"src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
"src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
@@ -6438,6 +6438,8 @@
"src/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mull.S",
"src/qs8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S",
"src/qs8-gemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S",
+ "src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S",
+ "src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
"src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S",
"src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
"src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
@@ -6472,6 +6474,8 @@
"src/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal.S",
"src/qs8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S",
"src/qs8-igemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S",
+ "src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S",
+ "src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
"src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S",
"src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S",
"src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9baf826..4c16652 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5354,6 +5354,8 @@
src/qs8-gemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mull.S
src/qs8-gemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S
src/qs8-gemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S
+ src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+ src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S
src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S
@@ -5388,6 +5390,8 @@
src/qs8-igemm/gen/2x8c8-minmax-rndnu-aarch64-neon-mlal.S
src/qs8-igemm/gen/2x8c16-minmax-fp32-aarch64-neon-mlal.S
src/qs8-igemm/gen/2x8c16-minmax-rndnu-aarch64-neon-mlal.S
+ src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+ src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-cortex-a53.S
src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-cortex-a53.S
diff --git a/bench/qs8-gemm-e2e.cc b/bench/qs8-gemm-e2e.cc
index a86b834..a88f1f2 100644
--- a/bench/qs8-gemm-e2e.cc
+++ b/bench/qs8-gemm-e2e.cc
@@ -116,6 +116,26 @@
4 /* mr */, 16 /* nr */, 2 /* log2_kr */, 0 /* log2_sr */,
benchmark::utils::CheckNEONDOT);
}
+ static void qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64,
+ xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckNEON);
+ }
+ static void qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, models::ExecutionPlanFactory model) {
+ GEMMEnd2EndBenchmark(state, model,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64,
+ xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64,
+ xnn_qs8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
+ xnn_qs8_igemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params,
+ 4 /* mr */, 8 /* nr */, 0 /* log2_kr */, 0 /* log2_sr */,
+ benchmark::utils::CheckNEON);
+ }
static void qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, models::ExecutionPlanFactory model) {
GEMMEnd2EndBenchmark(state, model,
xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
@@ -201,6 +221,8 @@
BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld32)
BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld64)
BENCHMARK_QS8_END2END(qs8_gemm_4x16c4__aarch64_neondot_ld128)
+ BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64)
+ BENCHMARK_QS8_END2END(qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64)
BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
BENCHMARK_QS8_END2END(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index c774fe9..0eb4989 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -210,7 +210,6 @@
}
#endif // BENCHMARK_RUY
-
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
static void qs8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55, 4, 16, 4, 1,
@@ -236,6 +235,14 @@
GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128, 4, 16, 4, 1,
xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEONDOT);
}
+ static void qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, 4, 8, 1, 1,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+ }
+ static void qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, 4, 8, 1, 1,
+ xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
+ }
static void qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
GEMMBenchmark(state, xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53, 4, 16, 1, 1,
xnn_init_qs8_conv_minmax_rndnu_neon_params, benchmark::utils::CheckNEON);
@@ -299,6 +306,8 @@
BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld64)
BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_ld128)
BENCHMARK_GEMM(qs8_gemm_4x16c4__aarch64_neondot_cortex_a55)
+ BENCHMARK_GEMM(qs8_gemm_4x8__aarch64_neon_mlal_lane_ld64)
+ BENCHMARK_GEMM(qs8_gemm_4x8__aarch64_neon_mlal_lane_prfm_ld64)
BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
BENCHMARK_GEMM(qs8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
diff --git a/scripts/generate-qs8-gemm.sh b/scripts/generate-qs8-gemm.sh
index 133ac1d..bc0c023 100755
--- a/scripts/generate-qs8-gemm.sh
+++ b/scripts/generate-qs8-gemm.sh
@@ -618,6 +618,9 @@
############################### AArch64 assembly ##############################
### Cortex-A53 lane micro-kernels
+tools/xngen src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=1 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S &
+
tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S &
tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
diff --git a/scripts/generate-qs8-igemm.sh b/scripts/generate-qs8-igemm.sh
index 44bb33a..4f9ee2d 100755
--- a/scripts/generate-qs8-igemm.sh
+++ b/scripts/generate-qs8-igemm.sh
@@ -606,6 +606,9 @@
############################### AArch64 assembly ##############################
### Cortex-A53 lane micro-kernels
+tools/xngen src/qs8-igemm/4x8-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S &
+tools/xngen src/qs8-igemm/4x8-aarch64-neon-mlal-lane-ld64.S.in -D PREFETCH=1 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S &
+
tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S &
tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S &
diff --git a/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
index f9848c0..a3fce54 100644
--- a/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -29,15 +29,13 @@
# A1 x15 v1
# A2 x13 v2
# A3 x4 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x8 v17 v21 v25 v29
# C2 x9 v18 v22 v26 v30
# C3 x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x10 x17 a53 temp registers
-
BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
# Clamp A and C pointers
diff --git a/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
index 1c45866..81712d0 100644
--- a/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qc8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -29,15 +29,13 @@
# A1 x15 v1
# A2 x13 v2
# A3 x4 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x8 v17 v21 v25 v29
# C2 x9 v18 v22 v26 v30
# C3 x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x10 x17 a53 temp registers
-
BEGIN_FUNCTION xnn_qc8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
# Clamp A and C pointers
diff --git a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
index ea14f76..26971a6 100644
--- a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -31,13 +31,12 @@
# A1 x14 v1
# A2 x15 v2
# A3 x20 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x16 v17 v21 v25 v29
# C2 x17 v18 v22 v26 v30
# C3 x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
@@ -53,7 +52,7 @@
CSEL x17, x16, x17, LS // c2 = c1
CMP x0, 4 // if mr < 4
- STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+ STR x20, [sp, -16]! // Save x20 on stack
ADD x7, x17, x7 // c3 = c2 + cm_stride
CSEL x7, x17, x7, LO // c3 = c2
@@ -401,8 +400,8 @@
# nc loop
B.HI 0b
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
# Remainder- 1 to 7 bytes of A
@@ -607,8 +606,8 @@
STR b1, [x16]
STR b0, [x6]
9:
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
diff --git a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
index 908e363..fd683a7 100644
--- a/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qc8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -31,13 +31,12 @@
# A1 x14 v1
# A2 x15 v2
# A3 x20 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x16 v17 v21 v25 v29
# C2 x17 v18 v22 v26 v30
# C3 x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
BEGIN_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
@@ -53,7 +52,7 @@
CSEL x17, x16, x17, LS // c2 = c1
CMP x0, 4 // if mr < 4
- STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+ STR x20, [sp, -16]! // Save x20 on stack
ADD x7, x17, x7 // c3 = c2 + cm_stride
CSEL x7, x17, x7, LO // c3 = c2
@@ -407,8 +406,8 @@
# nc loop
B.HI 0b
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
# Remainder- 1 to 7 bytes of A
@@ -613,8 +612,8 @@
STR b1, [x16]
STR b0, [x6]
9:
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
END_FUNCTION xnn_qc8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
diff --git a/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in b/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
index 8490502..612fb18 100644
--- a/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+++ b/src/qs8-gemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
@@ -53,7 +53,7 @@
# A1 x15 v1
# A2 x13 v2
# A3 x4 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x8 v17 v21 v25 v29
# C2 x9 v18 v22 v26 v30
@@ -64,8 +64,6 @@
$else:
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x10 x17 a53 temp registers
-
BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
# Clamp A and C pointers
diff --git a/src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in b/src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
new file mode 100644
index 0000000..b53b96c
--- /dev/null
+++ b/src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
@@ -0,0 +1,528 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert REQUANTIZATION in ["FP32", "RNDNU"]
+$assert not CHANNELWISE or REQUANTIZATION == "FP32"
+$assert DATATYPE in ["QC8", "QS8", "QU8"]
+$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
+$assert DATATYPE != "QU8" or REQUANTIZATION == "RNDNU"
+
+#include <xnnpack/assembly.h>
+
+$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
+$if DATATYPE == "QU8":
+ $REWIND_DECREMENT = 15
+$else:
+ $REWIND_DECREMENT = 3 if CHANNELWISE else {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
+$XMIN = "UMIN" if DATATYPE == "QU8" else "SMIN"
+$XMAX = "UMAX" if DATATYPE == "QU8" else "SMAX"
+$XXTL = "UXTL" if DATATYPE == "QU8" else "SXTL"
+$SQXTXN = "SQXTUN" if DATATYPE == "QU8" else "SQXTN"
+$SQXTXN2 = "SQXTUN2" if DATATYPE == "QU8" else "SQXTN2"
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
+# void xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const ${XINT8_T}* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+# ${XINT8_T}* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union ${PARAMS_UNION} params) [sp + 8] -> x11
+
+$if REQUANTIZATION == "RNDNU" and DATATYPE == "QU8":
+ # params structure is 20 bytes
+ # struct {
+ # ${XINT8_T} kernel_zero_point[4];
+ # int32_t right_pre_shift;
+ # int32_t multiplier;
+ # int32_t right_post_shift;
+ # int16_t output_zero_point;
+ # ${XINT8_T} output_min;
+ # ${XINT8_T} output_max;
+ # } rndnu_neon;
+ #
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3 x4 v3
+# B x5 v5
+# C0 x6 v24 v28
+# C1 x8 v25 v29
+# C2 x9 v26 v30
+# C3 x7 v27 v31
+$if DATATYPE == "QU8":
+ # zero_point v7
+ # unused v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+$else:
+ # unused v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+
+BEGIN_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+ # Clamp A and C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x12, x11, [sp] // Load cn_stride, params
+ ADD x15, x3, x4 // a1 = a0 + a_stride
+ ADD x8, x6, x7 // c1 = c0 + cm_stride
+ CSEL x15, x3, x15, LO // a1 = a0
+ CSEL x8, x6, x8, LO // c1 = c0
+
+ ADD x13, x15, x4 // a2 = a1 + a_stride
+ ADD x9, x8, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x13, x15, x13, LS // a2 = a1
+ CSEL x9, x8, x9, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ ADD x4, x13, x4 // a3 = a2 + a_stride
+ ADD x7, x9, x7 // c3 = c2 + cm_stride
+ CSEL x4, x13, x4, LO // a3 = a2
+ CSEL x7, x9, x7, LO // c3 = c2
+ $if DATATYPE == "QU8":
+ LD1R {v7.4s}, [x11], 4 // kernel_zero_point
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q24, q28, [x5], 32
+ SUBS x0, x2, 8 // k = kc - 8
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ # Is there at least 8 bytes for main loop?
+ B.LO 3f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+1:
+ LD1 {v0.8b}, [x3], 8
+ LDR d5, [x5], 8
+ LD1 {v1.8b}, [x15], 8
+ LD1 {v2.8b}, [x13], 8
+ LD1 {v3.8b}, [x4], 8
+ ${XXTL} v0.8h, v0.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ ${XXTL} v1.8h, v1.8b
+ ${XXTL} v2.8h, v2.8b
+ ${XXTL} v3.8h, v3.8b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x13, 128]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x15, 128]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x3, 128]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x4, 128]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 448]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 512]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 1b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+ # Is there a remainder?- 1 to 7 bytes of A
+ CBNZ x0, 3f
+
+2:
+ $if REQUANTIZATION == "RNDNU":
+ # Apply params - preshift, scale, postshift, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SQSHL v24.4s, v24.4s, v4.4s // shift to upper bits
+ SQSHL v25.4s, v25.4s, v4.4s
+ SQSHL v26.4s, v26.4s, v4.4s
+ SQSHL v27.4s, v27.4s, v4.4s
+ LD1R {v5.4s}, [x11], 4
+ SQSHL v28.4s, v28.4s, v4.4s
+ SQSHL v29.4s, v29.4s, v4.4s
+ SQSHL v30.4s, v30.4s, v4.4s
+ SQSHL v31.4s, v31.4s, v4.4s
+ LD1R {v6.4s}, [x11], 4
+ SQDMULH v24.4s, v24.4s, v5.4s // scale without rounding
+ SQDMULH v25.4s, v25.4s, v5.4s
+ SQDMULH v26.4s, v26.4s, v5.4s
+ SQDMULH v27.4s, v27.4s, v5.4s
+ SQDMULH v28.4s, v28.4s, v5.4s
+ SQDMULH v29.4s, v29.4s, v5.4s
+ SQDMULH v30.4s, v30.4s, v5.4s
+ SQDMULH v31.4s, v31.4s, v5.4s
+ SRSHL v24.4s, v24.4s, v6.4s // signed rounding shift left
+ SRSHL v25.4s, v25.4s, v6.4s
+ SRSHL v26.4s, v26.4s, v6.4s
+ SRSHL v27.4s, v27.4s, v6.4s
+ SRSHL v28.4s, v28.4s, v6.4s
+ SRSHL v29.4s, v29.4s, v6.4s
+ SRSHL v30.4s, v30.4s, v6.4s
+ SRSHL v31.4s, v31.4s, v6.4s
+ $elif REQUANTIZATION == "FP32":
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ $if not CHANNELWISE:
+ # Apply params - scale, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ $else:
+ # Load per channel scale values from weights
+ LDR q4, [x5], 16
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ LDR q5, [x5], 16
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ $if CHANNELWISE:
+ LDR q6, [x5], 16
+ FMUL v24.4s, v24.4s, v6.4s
+ FMUL v25.4s, v25.4s, v6.4s
+ FMUL v26.4s, v26.4s, v6.4s
+ FMUL v27.4s, v27.4s, v6.4s
+ LDR q4, [x5], 16
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+ $else:
+ FMUL v24.4s, v24.4s, v4.4s
+ FMUL v25.4s, v25.4s, v4.4s
+ FMUL v26.4s, v26.4s, v4.4s
+ FMUL v27.4s, v27.4s, v4.4s
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+ LD1R {v4.8b}, [x11], 1 // clamp min value
+
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v5.8b}, [x11] // clamp max value
+ ${SQXTXN} v0.8b, v24.8h
+ ${SQXTXN} v1.8b, v25.8h
+ ${SQXTXN} v2.8b, v26.8h
+ ${SQXTXN} v3.8b, v27.8h
+ SUB x11, x11, ${REWIND_DECREMENT} // rewind params pointer
+
+ ${XMAX} v0.8b, v0.8b, v4.8b
+ ${XMAX} v1.8b, v1.8b, v4.8b
+ ${XMAX} v2.8b, v2.8b, v4.8b
+ ${XMAX} v3.8b, v3.8b, v4.8b
+ SUBS x1, x1, 8
+ ${XMIN} v0.8b, v0.8b, v5.8b
+ ${XMIN} v1.8b, v1.8b, v5.8b
+ ${XMIN} v2.8b, v2.8b, v5.8b
+ ${XMIN} v3.8b, v3.8b, v5.8b
+ B.LO 4f
+
+ # Store full 4 x 8
+ ST1 {v0.8b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v1.8b}, [x8], x12
+ SUB x15, x15, x2 // a1 -= kc
+ ST1 {v2.8b}, [x9], x12
+ SUB x13, x13, x2 // a2 -= kc
+ ST1 {v3.8b}, [x7], x12
+ SUB x4, x4, x2 // a3 -= kc
+ B.NE 0b
+ RET
+
+ # Remainder- 1 to 7 bytes of A
+ .p2align 3
+3:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x3], x0
+ LDR d5, [x5], 8
+ LD1 {v1.8b}, [x15], x0
+ LD1 {v2.8b}, [x13], x0
+ LD1 {v3.8b}, [x4], x0
+ ${XXTL} v0.8h, v0.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ ${XXTL} v1.8h, v1.8b
+ ${XXTL} v2.8h, v2.8b
+ ${XXTL} v3.8h, v3.8b
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 2b
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 2b
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 2b
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 2b
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 2b
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 2b
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 2b
+
+ # Store odd width
+ .p2align 3
+4:
+ TBZ x1, 2, 5f
+ STR s0, [x6], 4
+ STR s1, [x8], 4
+ DUP s0, v0.s[1]
+ DUP s1, v1.s[1]
+ STR s2, [x9], 4
+ STR s3, [x7], 4
+ DUP s2, v2.s[1]
+ DUP s3, v3.s[1]
+5:
+ TBZ x1, 1, 6f
+ STR h0, [x6], 2
+ STR h1, [x8], 2
+ DUP h0, v0.h[1]
+ DUP h1, v1.h[1]
+ STR h2, [x9], 2
+ STR h3, [x7], 2
+ DUP h2, v2.h[1]
+ DUP h3, v3.h[1]
+6:
+ TBZ x1, 0, 7f
+ STR b0, [x6]
+ STR b1, [x8]
+ STR b2, [x9]
+ STR b3, [x7]
+7:
+ RET
+
+END_FUNCTION xnn_${DATATYPE.lower()}_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
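
The RNDNU requantization tail in the template above applies a saturating pre-shift (SQSHL, "shift to upper bits"), a doubling high-half multiply (SQDMULH), a rounding post-shift (SRSHL), adds the output zero point (SQADD), then narrows and clamps. The scalar sketch below is only an illustration of that arithmetic: the parameter names are hypothetical, the shift amounts are written as plain non-negative shift counts rather than the signed values stored in the params struct, and NEON saturation is simplified to explicit clamping.

  #include <stdint.h>

  static inline int32_t clamp_s64(int64_t v, int64_t lo, int64_t hi) {
    return (int32_t) (v < lo ? lo : (v > hi ? hi : v));
  }

  // Hedged scalar sketch of the SQSHL / SQDMULH / SRSHL / SQADD / SQXTN tail.
  static inline int8_t rndnu_requantize(
      int32_t acc,          // int32 accumulator from the GEMM loop
      uint32_t pre_shift,   // left shift applied by SQSHL
      int32_t multiplier,   // fixed-point multiplier applied by SQDMULH
      uint32_t post_shift,  // rounding right shift applied by SRSHL
      int16_t zero_point,   // output zero point added by SQADD
      int8_t qmin,          // clamp bounds applied by SMAX/SMIN
      int8_t qmax)
  {
    // SQSHL: saturating left shift into the upper bits.
    int64_t v = clamp_s64((int64_t) acc << pre_shift, INT32_MIN, INT32_MAX);
    // SQDMULH: high half of the doubling multiply, no rounding.
    v = (v * (int64_t) multiplier * 2) >> 32;
    // SRSHL with a negative shift count behaves as a rounding right shift.
    if (post_shift > 0) {
      v = (v + ((int64_t) 1 << (post_shift - 1))) >> post_shift;
    }
    // SQXTN to 16 bits, SQADD of the zero point, SQXTN to 8 bits, then clamp.
    v = clamp_s64(v, INT16_MIN, INT16_MAX) + zero_point;
    if (v < qmin) v = qmin;
    if (v > qmax) v = qmax;
    return (int8_t) v;
  }
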
diff --git a/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
index d552539..1a517b6 100644
--- a/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -29,15 +29,13 @@
# A1 x15 v1
# A2 x13 v2
# A3 x4 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x8 v17 v21 v25 v29
# C2 x9 v18 v22 v26 v30
# C3 x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x10 x17 a53 temp registers
-
BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
# Clamp A and C pointers
diff --git a/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
index 6f95707..f3567b9 100644
--- a/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qs8-gemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -29,15 +29,13 @@
# A1 x15 v1
# A2 x13 v2
# A3 x4 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x8 v17 v21 v25 v29
# C2 x9 v18 v22 v26 v30
# C3 x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x10 x17 a53 temp registers
-
BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
# Clamp A and C pointers
diff --git a/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
index 980b41b..e3b2332 100644
--- a/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -29,15 +29,13 @@
# A1 x15 v1
# A2 x13 v2
# A3 x4 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x8 v17 v21 v25 v29
# C2 x9 v18 v22 v26 v30
# C3 x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x10 x17 a53 temp registers
-
BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
# Clamp A and C pointers
diff --git a/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
index dd64d8e..7fbc9fa 100644
--- a/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -29,15 +29,13 @@
# A1 x15 v1
# A2 x13 v2
# A3 x4 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x8 v17 v21 v25 v29
# C2 x9 v18 v22 v26 v30
# C3 x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x10 x17 a53 temp registers
-
BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
# Clamp A and C pointers
diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..0c9824a
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-ld64.S
@@ -0,0 +1,746 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/4x8-aarch32-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64(
+// size_t mr, r0
+// size_t nc, r1
+// size_t kc, r2 -> r5
+// const uint8_t*restrict a, r3
+// size_t a_stride, sp + 96 -> (r7)
+// const void*restrict w, sp + 100 -> r9
+// uint8_t*restrict c, sp + 104 -> r11
+// size_t cm_stride, sp + 108 -> (r6)
+// size_t cn_stride, sp + 112 -> r7
+// const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) sp + 116 -> (r7)
+
+
+// inner loop registers
+
+// A0 r7 d0
+// A1 r6 d1
+// A2 r2 d2
+// A3 r3 d3
+
+// B r9 d8, d9, d10, d11
+// B d12, d13, d14, d15
+
+// C3 ip r12 [240]q9 q5
+// C2 sl r10 q15 q7
+// C1 fp r11 q2 q4
+// C0 r0 q6 q14
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64
+ .arm
+#ifndef __APPLE__
+ .arch armv7-a
+ .fpu neon
+#endif
+ # Push 104 bytes. r2 is for kc reset
+ VPUSH {d8-d15} // 64 bytes
+ PUSH {r2, r4, r5, r6, r7, r8, r9, sl, fp, lr} // 40 bytes
+ SUB sp, sp, #456 // +456 = 560 bytes. TODO eliminate
+ MOV lr, r1
+ LDR r1, [sp, #560]
+ CMP r0, #2
+ MOV r4, r3
+ ADDCS r4, r4, r1
+ CMP r0, #3
+ LDR r9, [sp, #580]
+ MOV r8, #15
+ MOV r6, r4
+ LDR ip, [sp, #568]
+ ADDCS r6, r6, r1
+ CMP r0, #4
+ LDR r5, [sp, #572]
+ MOV r7, r6
+ MOV sl, ip
+ ADDEQ r7, r7, r1
+ MOV r1, r9
+ VLD1.32 {d16-d17}, [r1], r8
+ CMP r0, #2
+ ADDCS sl, sl, r5
+ CMP r0, #3
+ VLD1.8 {d18-d19}, [r1]
+ ADD r1, r9, #4
+ MOV fp, sl
+ VLD1.32 {d20-d21}, [r1]
+ ADD r1, r9, #8
+ ADDCS fp, fp, r5
+ CMP r0, #4
+ VLD1.64 {d22-d23}, [r1]
+ ADD r1, r9, #12
+ MOV r0, fp
+ VLD1.32 {d24-d25}, [r1]
+ ADD r1, r9, #14
+ ADDEQ r0, r0, r5
+ MOV r9, #32
+ VLD1.16 {d26-d27}, [r1]
+ MOV r1, r2
+ MOV r2, r4
+ ADD r4, sp, #128
+ VDUP.32 q0, d16[0]
+ LDR r5, [sp, #576]
+ LDR r8, [sp, #564] // w
+ VDUP.8 q8, d26[0]
+ STR r1, [sp, #56]
+ VDUP.8 q1, d18[0]
+ VSTMIA r4, {d16-d17}
+ ADD r4, sp, #112
+ VDUP.16 q8, d24[0]
+ VSTMIA r4, {d16-d17}
+ ADD r4, sp, #96
+ VDUP.32 q8, d22[0]
+ VSTMIA r4, {d16-d17}
+ ADD r4, sp, #80
+ VDUP.32 q8, d20[0]
+ VSTMIA r4, {d16-d17}
+ ADD r4, sp, #32
+ VSTMIA r4, {d0-d1}
+ ADD r4, sp, #16
+ VSTMIA r4, {d2-d3}
+0:
+ # Load initial bias from w into accumulators
+ ADD r4, r8, #16
+ VLD1.8 {d16-d17}, [r8], r9 // Bias
+ CMP r1, #8
+ STR lr, [sp, #12]
+ ADD lr, sp, #240
+ VLD1.8 {d10-d11}, [r4]
+ VSTMIA lr, {d16-d17}
+ LDR lr, [sp, #12]
+ BCC 2f // less than 8 channels? skip main loop
+ STR lr, [sp, #64]
+ ADD lr, sp, #240
+ VORR q7, q5, q5
+ STR ip, [sp, #68]
+ VLDMIA lr, {d6-d7}
+ VORR q14, q5, q5
+ VORR q6, q5, q5
+ MOV ip, #0
+ VORR q15, q3, q3
+ MOV r4, r1
+ VORR q4, q3, q3
+ STR r0, [sp, #72]
+ STR fp, [sp, #76]
+ STR sl, [sp, #60]
+ STR r7, [sp, #156]
+
+ # Main loop - 8 bytes of A
+
+1:
+ MOV r9, r4
+ MOV r4, r7
+ LDR lr, [r8, #4]
+ MOV r7, r6
+ MOV r6, r2
+ LDR sl, [r4, ip]!
+ LDR fp, [r8]
+ LDR r2, [r8, #8]
+ LDR r1, [r8, #12]
+ STR lr, [sp, #364]
+ ADD lr, sp, #192
+ LDR r0, [r8, #24]
+ STR fp, [sp, #360]
+ LDR r5, [r8, #20]
+ STR r2, [sp, #384]
+ ADD r2, sp, #360
+ STR r1, [sp, #388]
+ VLD1.8 {d16}, [r2 :64]
+ ADD r2, sp, #384
+ VLD1.8 {d17}, [r2 :64]
+ VMOVL.S8 q10, d16
+ LDR r1, [r8, #28]
+ LDR r2, [r8, #16]
+ STR sl, [sp, #416]
+ STR r1, [sp, #380]
+ STR r0, [sp, #376]
+ STR r5, [sp, #372]
+ STR r2, [sp, #368]
+ LDR r0, [r4, #4]
+ MOV r4, r9
+ STR r0, [sp, #420]
+ ADD r0, sp, #416
+ SUB r4, r9, #8
+ VLD1.8 {d18}, [r0 :64]
+ ADD r0, sp, #376
+ CMP r4, #7
+ VMOVL.S8 q0, d18
+ VLD1.8 {d16}, [r0 :64]
+ VMOVL.S8 q9, d17
+ ADD r0, sp, #368
+ VMLAL.S16 q6, d21, d0[0]
+ VLD1.8 {d17}, [r0 :64]
+ VORR q11, q9, q9
+ LDR r0, [r8, #32]
+ VMLAL.S16 q3, d20, d0[0]
+ LDR r1, [r8, #36]
+ VMLAL.S16 q6, d19, d0[1]
+ LDR r2, [r8, #40]
+ VMOVL.S8 q9, d17
+ LDR r5, [r8, #44]
+ STR r0, [sp, #424]
+ ADD r0, sp, #424
+ STR r1, [sp, #428]
+ VMLAL.S16 q6, d19, d0[2]
+ VORR q1, q9, q9
+ VMOVL.S8 q9, d16
+ VMLAL.S16 q6, d19, d0[3]
+ VSTMIA lr, {d18-d19}
+ ADD lr, sp, #176
+ VORR q9, q11, q11
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #432
+ VMLAL.S16 q3, d18, d0[1]
+ VMOVL.S8 q8, d16
+ STR r2, [sp, #432]
+ STR r5, [sp, #436]
+ MOV r2, r6
+ MOV r6, r7
+ LDR r7, [sp, #156]
+ VMLAL.S16 q6, d17, d1[0]
+ LDR r1, [r8, #48]
+ VSTMIA lr, {d16-d17}
+ ADD lr, sp, #208
+ VLD1.8 {d16}, [r0 :64]
+ LDR r0, [r8, #52]
+ VMOVL.S8 q8, d16
+ STR r0, [sp, #444]
+ ADD r0, sp, #440
+ STR r1, [sp, #440]
+ VLD1.8 {d24}, [r0 :64]
+ VMLAL.S16 q6, d17, d1[1]
+ VORR q2, q8, q8
+ LDR r0, [r8, #60]
+ VMOVL.S8 q8, d24
+ LDR r1, [r8, #56]
+ ADD r8, r8, #64
+ STR r0, [sp, #452]
+ ADD r0, sp, #448
+ STR r1, [sp, #448]
+ VMLAL.S16 q6, d17, d1[2]
+ VLD1.8 {d26}, [r0 :64]
+ VORR q12, q8, q8
+ MOV r0, r6
+ VMOVL.S8 q13, d26
+ LDR r1, [r0, ip]!
+ VMLAL.S16 q6, d27, d1[3]
+ VSTMIA lr, {d12-d13}
+ VORR q6, q10, q10
+ VORR q10, q1, q1
+ ADD lr, sp, #256
+ VORR q1, q12, q12
+ VSTMIA lr, {d22-d23}
+ ADD lr, sp, #192
+ VMLAL.S16 q3, d20, d0[2]
+ VLDMIA lr, {d22-d23}
+ ADD lr, sp, #176
+ VLDMIA lr, {d16-d17}
+ ADD lr, sp, #160
+ VMLAL.S16 q3, d22, d0[3]
+ STR r1, [sp, #408]
+ LDR r0, [r0, #4]
+ STR r0, [sp, #412]
+ ADD r0, sp, #408
+ VMLAL.S16 q3, d16, d1[0]
+ VMLAL.S16 q3, d4, d1[1]
+ VMLAL.S16 q3, d24, d1[2]
+ VORR q12, q8, q8
+ VMLAL.S16 q3, d26, d1[3]
+ VLD1.8 {d0}, [r0 :64]
+ MOV r0, r2
+ VMOVL.S8 q0, d0
+ VSTMIA lr, {d4-d5}
+ ADD lr, sp, #256
+ LDR r1, [r0, ip]!
+ VMLAL.S16 q14, d13, d0[0]
+ VMLAL.S16 q4, d12, d0[0]
+ VMLAL.S16 q14, d19, d0[1]
+ VORR q9, q10, q10
+ VMLAL.S16 q14, d21, d0[2]
+ VORR q10, q6, q6
+ VMLAL.S16 q14, d23, d0[3]
+ VMLAL.S16 q14, d17, d1[0]
+ VORR q8, q2, q2
+ VLDMIA lr, {d4-d5}
+ ADD lr, sp, #160
+ VMLAL.S16 q4, d4, d0[1]
+ STR r1, [sp, #400]
+ VMLAL.S16 q14, d17, d1[1]
+ LDR r0, [r0, #4]
+ STR r0, [sp, #404]
+ ADD r0, sp, #400
+ VMLAL.S16 q4, d18, d0[2]
+ VMLAL.S16 q14, d3, d1[2]
+ VMLAL.S16 q4, d22, d0[3]
+ VMLAL.S16 q14, d27, d1[3]
+ VMLAL.S16 q4, d24, d1[0]
+ VMLAL.S16 q4, d16, d1[1]
+ VLDMIA lr, {d16-d17}
+ ADD lr, sp, #224
+ VMLAL.S16 q4, d2, d1[2]
+ VMLAL.S16 q4, d26, d1[3]
+ VLD1.8 {d0}, [r0 :64]
+ MOV r0, r3
+ VMOVL.S8 q0, d0
+ LDR r1, [r0, ip]!
+ ADD ip, ip, #8
+ VMLAL.S16 q7, d13, d0[0]
+ VMLAL.S16 q15, d20, d0[0]
+ VMLAL.S16 q7, d5, d0[1]
+ VORR q2, q9, q9
+ VMLAL.S16 q7, d19, d0[2]
+ VORR q9, q1, q1
+ VMLAL.S16 q7, d23, d0[3]
+ VMLAL.S16 q7, d25, d1[0]
+ VMLAL.S16 q7, d17, d1[1]
+ VMLAL.S16 q7, d3, d1[2]
+ VMLAL.S16 q7, d27, d1[3]
+ VSTMIA lr, {d14-d15}
+ ADD lr, sp, #240
+ STR r1, [sp, #392]
+ LDR r0, [r0, #4]
+ STR r0, [sp, #396]
+ ADD r0, sp, #392
+ VLDMIA lr, {d12-d13}
+ ADD lr, sp, #256
+ VLD1.8 {d2}, [r0 :64]
+ VMOVL.S8 q1, d2
+ VLDMIA lr, {d14-d15}
+ ADD lr, sp, #224
+ VMLAL.S16 q15, d14, d0[1]
+ VMLAL.S16 q6, d20, d2[0]
+ VMLAL.S16 q5, d21, d2[0]
+ VMLAL.S16 q15, d4, d0[2]
+ VMLAL.S16 q6, d14, d2[1]
+ VMLAL.S16 q5, d15, d2[1]
+ VLDMIA lr, {d14-d15}
+ ADD lr, sp, #240
+ VMLAL.S16 q15, d22, d0[3]
+ VMLAL.S16 q6, d4, d2[2]
+ VMLAL.S16 q5, d5, d2[2]
+ VMLAL.S16 q15, d24, d1[0]
+ VMLAL.S16 q6, d22, d2[3]
+ VMLAL.S16 q5, d23, d2[3]
+ VMLAL.S16 q15, d16, d1[1]
+ VMLAL.S16 q6, d24, d3[0]
+ VMLAL.S16 q5, d25, d3[0]
+ VMLAL.S16 q15, d18, d1[2]
+ VMLAL.S16 q6, d16, d3[1]
+ VMLAL.S16 q5, d17, d3[1]
+ VMLAL.S16 q15, d26, d1[3]
+ VMLAL.S16 q6, d18, d3[2]
+ VMLAL.S16 q5, d19, d3[2]
+ VMLAL.S16 q6, d26, d3[3]
+ VMLAL.S16 q5, d27, d3[3]
+ VSTMIA lr, {d12-d13}
+ ADD lr, sp, #208
+ VLDMIA lr, {d12-d13}
+ BHI 1b
+ ADD r5, sp, #32
+ ADD lr, sp, #56
+ VORR q2, q4, q4
+ ADD r7, r7, ip
+ VLDMIA r5, {d0-d1}
+ ADD r5, sp, #16
+ VORR q4, q14, q14
+ ADD r6, r6, ip
+ VLDMIA r5, {d2-d3}
+ ADD r2, r2, ip
+ ADD r3, r3, ip
+ VORR q14, q3, q3
+ LDR ip, [sp, #68]
+ MOV r9, #32
+ LDR fp, [sp, #76]
+ LDR r0, [sp, #72]
+ LDR r5, [sp, #576]
+ LDM lr, {r1, sl, lr}
+ B 3f
+2:
+ STR lr, [sp, #12]
+ ADD lr, sp, #240
+ VORR q15, q5, q5
+ MOV r4, r1
+ VLDMIA lr, {d16-d17}
+ VORR q6, q5, q5
+ VORR q4, q5, q5
+ LDR lr, [sp, #12]
+ VORR q14, q8, q8
+ VORR q2, q8, q8
+ VORR q7, q5, q5
+ VORR q15, q8, q8
+3:
+ CMP r4, #0
+ BNE 5f
+
+ # rndnu quantization
+ # C3 [240]q9 q5
+ # C2 q15 q7
+ # C1 q2 q4
+ # C0 q6 q14
+
+4:
+ ADD r4, sp, #80
+ VSHL.S32 q11, q2, q0
+ CMP lr, #7
+ VLDMIA r4, {d4-d5}
+
+ VSHL.S32 q13, q15, q0
+ VLDR q15, [sp, 240] // q15 spilled
+
+ VSHL.S32 q8, q6, q0
+ VSHL.S32 q9, q14, q0
+ VSHL.S32 q10, q4, q0
+ VSHL.S32 q12, q7, q0
+ VSHL.S32 q14, q5, q0
+ VSHL.S32 q15, q15, q0
+
+ VQDMULH.S32 q8, q8, q2
+ VQDMULH.S32 q9, q9, q2
+ VQDMULH.S32 q10, q10, q2
+ VQDMULH.S32 q12, q12, q2
+ VQDMULH.S32 q11, q11, q2
+ VQDMULH.S32 q13, q13, q2
+ VQDMULH.S32 q14, q14, q2
+ VQDMULH.S32 q15, q15, q2
+ VLDMIA r4, {d4-d5}
+ VRSHL.S32 q8, q8, q2
+ VRSHL.S32 q9, q9, q2
+ VRSHL.S32 q10, q10, q2
+ VRSHL.S32 q12, q12, q2
+ VRSHL.S32 q11, q11, q2
+ VRSHL.S32 q13, q13, q2
+ VRSHL.S32 q14, q14, q2
+ VRSHL.S32 q15, q15, q2
+ VQMOVN.S32 d17, q8
+ VQMOVN.S32 d16, q9
+ VQMOVN.S32 d19, q10
+ VQMOVN.S32 d21, q12
+ VLDMIA r4, {d24-d25}
+ VQMOVN.S32 d18, q11
+ VQMOVN.S32 d20, q13
+ VQMOVN.S32 d23, q14
+ VQMOVN.S32 d22, q15
+ VQADD.S16 q8, q8, q12
+ VQADD.S16 q9, q9, q12
+ VQADD.S16 q10, q10, q12
+ VQADD.S16 q11, q11, q12
+ VQMOVN.S16 d17, q8
+ VQMOVN.S16 d16, q9
+ VQMOVN.S16 d19, q10
+ VQMOVN.S16 d18, q11
+ VLDMIA r4, {d20-d21}
+ VMAX.S8 q8, q8, q10
+ VMAX.S8 q10, q9, q10
+ SUBS lr, lr, #8
+ VMIN.S8 q9, q8, q1
+ VMIN.S8 q11, q10, q1
+ BLS 9f
+
+ # Store full 4 x 8
+ VST1.8 {d22}, [ip], r5
+ SUB r7, r7, r1
+ VST1.8 {d23}, [sl], r5
+ SUB r6, r6, r1
+ VST1.8 {d18}, [fp], r5
+ SUB r2, r2, r1
+ VST1.8 {d19}, [r0], r5
+ SUB r3, r3, r1
+ BNE 0b
+
+ ADD sp, sp, #460 // skip over r2.
+ POP {r4, r5, r6, r7, r8, r9, sl, fp, lr}
+ VPOP {d8-d15}
+ BX lr
+
+5:
+ STR r0, [sp, #72]
+ ADD r0, r8, #8
+ STR r0, [sp, #224]
+ MOV r5, r7
+ MOV r7, r6
+ LDR r0, [r8]
+ STR r0, [sp, #256]
+ MOV r6, r2
+ LDR r2, [r7]
+ MOV r9, r3
+ LDR r1, [r7, #4]
+ CMP r4, #1
+ LDR r0, [r5, #4]
+ STR fp, [sp, #76]
+ LDR fp, [r8, #4]
+ LDR r3, [r5]
+ STR r1, [sp, #340]
+ STR r2, [sp, #336]
+ STR r0, [sp, #348]
+ LDR r0, [r6]
+ LDR r2, [r6, #4]
+ STR r3, [sp, #344]
+ MOV r3, r9
+ STR fp, [sp, #356]
+ ADD r3, r4, r9
+ LDR r1, [sp, #256]
+ STR r1, [sp, #352]
+ STR r0, [sp, #328]
+ ADD r0, sp, #336
+ STR r2, [sp, #332]
+ MOV r2, r6
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #344
+ MOV r6, r7
+ MOV r7, r5
+ VLD1.8 {d17}, [r0 :64]
+ ADD r0, sp, #352
+ VMOVL.S8 q10, d16
+ ADD r7, r4, r5
+ VLD1.8 {d18}, [r0 :64]
+ ADD r0, sp, #328
+ VMOVL.S8 q3, d17
+ ADD r6, r4, r6
+ VLD1.8 {d17}, [r0 :64]
+ VMOVL.S8 q9, d18
+ VORR q11, q10, q10
+ ADD r2, r4, r2
+ VMOVL.S8 q8, d17
+ LDR r0, [r9]
+ LDR r1, [r9, #4]
+ VMLAL.S16 q6, d19, d6[0]
+ STR r0, [sp, #320]
+ VORR q12, q3, q3
+ VMLAL.S16 q14, d18, d6[0]
+ ADD r0, sp, #320
+ VORR q3, q10, q10
+ STR r1, [sp, #324]
+ VORR q10, q8, q8
+ VMLAL.S16 q4, d19, d6[0]
+ VMLAL.S16 q2, d18, d6[0]
+ VORR q3, q8, q8
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #240
+ VMLAL.S16 q7, d19, d6[0]
+ VMLAL.S16 q15, d18, d6[0]
+ VMOVL.S8 q3, d16
+ VLDMIA r0, {d16-d17}
+ ADD r0, sp, #240
+ VMLAL.S16 q8, d18, d6[0]
+ VMLAL.S16 q5, d19, d6[0]
+ VSTMIA r0, {d16-d17}
+ BNE 6f
+ LDR r8, [sp, #224]
+ MOV r9, #32
+ LDR fp, [sp, #76]
+ B 8f
+6:
+ LDR r5, [sp, #224]
+ VORR q13, q3, q3
+ VORR q3, q12, q12
+ CMP r4, #3
+ MOV r9, #32
+ LDR r0, [r5]
+ LDR r1, [r5, #4]
+ STR r0, [sp, #312]
+ ADD r0, sp, #312
+ STR r1, [sp, #316]
+ ADD r1, r5, #8
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #240
+ VMOVL.S8 q8, d16
+ VLDMIA r0, {d18-d19}
+ ADD r0, sp, #240
+ LDR fp, [sp, #76]
+ VMLAL.S16 q6, d17, d6[1]
+ VMLAL.S16 q14, d16, d6[1]
+ VORR q3, q11, q11
+ VMLAL.S16 q4, d17, d6[1]
+ VMLAL.S16 q2, d16, d6[1]
+ VORR q3, q10, q10
+ VMLAL.S16 q7, d17, d6[1]
+ VMLAL.S16 q15, d16, d6[1]
+ VORR q3, q13, q13
+ VMLAL.S16 q9, d16, d6[1]
+ VMLAL.S16 q5, d17, d6[1]
+ VSTMIA r0, {d18-d19}
+ BCC 7f
+ LDR r0, [r1]
+ VORR q3, q12, q12
+ LDR r1, [r1, #4]
+ CMP r4, #3
+ STR r0, [sp, #304]
+ ADD r0, sp, #304
+ STR r1, [sp, #308]
+ ADD r1, r5, #16
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #240
+ VMOVL.S8 q8, d16
+ VLDMIA r0, {d18-d19}
+ ADD r0, sp, #240
+ VMLAL.S16 q6, d17, d6[2]
+ VMLAL.S16 q14, d16, d6[2]
+ VORR q3, q11, q11
+ VMLAL.S16 q4, d17, d6[2]
+ VMLAL.S16 q2, d16, d6[2]
+ VORR q3, q10, q10
+ VMLAL.S16 q7, d17, d6[2]
+ VMLAL.S16 q15, d16, d6[2]
+ VORR q3, q13, q13
+ VMLAL.S16 q9, d16, d6[2]
+ VMLAL.S16 q5, d17, d6[2]
+ VSTMIA r0, {d18-d19}
+ BEQ 7f
+ LDR r0, [r1]
+ VORR q3, q12, q12
+ LDR r1, [r1, #4]
+ CMP r4, #5
+ STR r0, [sp, #296]
+ ADD r0, sp, #296
+ STR r1, [sp, #300]
+ ADD r1, r5, #24
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #240
+ VMOVL.S8 q8, d16
+ VLDMIA r0, {d18-d19}
+ ADD r0, sp, #240
+ VMLAL.S16 q6, d17, d6[3]
+ VMLAL.S16 q14, d16, d6[3]
+ VORR q3, q11, q11
+ VMLAL.S16 q4, d17, d6[3]
+ VMLAL.S16 q2, d16, d6[3]
+ VORR q3, q10, q10
+ VMLAL.S16 q7, d17, d6[3]
+ VMLAL.S16 q15, d16, d6[3]
+ VORR q3, q13, q13
+ VMLAL.S16 q9, d16, d6[3]
+ VMLAL.S16 q5, d17, d6[3]
+ VSTMIA r0, {d18-d19}
+ BCC 7f
+ LDR r0, [r1]
+ VORR q3, q12, q12
+ LDR r1, [r1, #4]
+ CMP r4, #5
+ STR r0, [sp, #288]
+ ADD r0, sp, #288
+ STR r1, [sp, #292]
+ ADD r1, r5, #32
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #240
+ VMOVL.S8 q8, d16
+ VLDMIA r0, {d18-d19}
+ ADD r0, sp, #240
+ VMLAL.S16 q6, d17, d7[0]
+ VMLAL.S16 q14, d16, d7[0]
+ VORR q3, q11, q11
+ VMLAL.S16 q4, d17, d7[0]
+ VMLAL.S16 q2, d16, d7[0]
+ VORR q3, q10, q10
+ VMLAL.S16 q7, d17, d7[0]
+ VMLAL.S16 q15, d16, d7[0]
+ VORR q3, q13, q13
+ VMLAL.S16 q9, d16, d7[0]
+ VMLAL.S16 q5, d17, d7[0]
+ VSTMIA r0, {d18-d19}
+ BEQ 7f
+ LDR r0, [r1]
+ VORR q3, q12, q12
+ LDR r1, [r1, #4]
+ CMP r4, #7
+ STR r0, [sp, #280]
+ ADD r0, sp, #280
+ STR r1, [sp, #284]
+ ADD r1, r5, #40
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #240
+ VMOVL.S8 q8, d16
+ VLDMIA r0, {d18-d19}
+ ADD r0, sp, #240
+ VMLAL.S16 q6, d17, d7[1]
+ VMLAL.S16 q14, d16, d7[1]
+ VORR q3, q11, q11
+ VMLAL.S16 q4, d17, d7[1]
+ VMLAL.S16 q2, d16, d7[1]
+ VORR q3, q10, q10
+ VMLAL.S16 q7, d17, d7[1]
+ VMLAL.S16 q15, d16, d7[1]
+ VORR q3, q13, q13
+ VMLAL.S16 q9, d16, d7[1]
+ VMLAL.S16 q5, d17, d7[1]
+ VSTMIA r0, {d18-d19}
+ BCC 7f
+ LDR r0, [r1]
+ VORR q3, q12, q12
+ LDR r1, [r1, #4]
+ ADD r8, r8, #56
+ STR r0, [sp, #272]
+ ADD r0, sp, #272
+ STR r1, [sp, #276]
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #240
+ VMOVL.S8 q8, d16
+ VLDMIA r0, {d18-d19}
+ ADD r0, sp, #240
+ VMLAL.S16 q6, d17, d7[2]
+ VMLAL.S16 q14, d16, d7[2]
+ VORR q3, q11, q11
+ VMLAL.S16 q4, d17, d7[2]
+ VMLAL.S16 q2, d16, d7[2]
+ VORR q3, q10, q10
+ VMLAL.S16 q7, d17, d7[2]
+ VMLAL.S16 q15, d16, d7[2]
+ VORR q3, q13, q13
+ VMLAL.S16 q9, d16, d7[2]
+ VMLAL.S16 q5, d17, d7[2]
+ VSTMIA r0, {d18-d19}
+ B 8f
+7:
+ MOV r8, r1
+8:
+ LDR r0, [sp, #72]
+ LDR r1, [sp, #56]
+ LDR r5, [sp, #576]
+ B 4b
+
+ # Store odd width
+9:
+ TST lr, #4
+ BEQ 10f
+ VST1.32 {d22[0]}, [ip]!
+ VST1.32 {d23[0]}, [sl]!
+ VST1.32 {d18[0]}, [fp]!
+ VST1.32 {d19[0]}, [r0]!
+ VEXT.8 q9, q9, q9, #4
+ VEXT.8 q11, q11, q11, #4
+10:
+ TST lr, #2
+ BEQ 11f
+ VST1.16 {d22[0]}, [ip]!
+ VST1.16 {d23[0]}, [sl]!
+ VST1.16 {d18[0]}, [fp]!
+ VST1.16 {d19[0]}, [r0]!
+ VEXT.8 q9, q9, q9, #2
+ VEXT.8 q11, q11, q11, #2
+11:
+ TST lr, #1
+ BEQ 12f
+ VST1.8 {d22[0]}, [ip]
+ VST1.8 {d23[0]}, [sl]
+ VST1.8 {d18[0]}, [fp]
+ VST1.8 {d19[0]}, [r0]
+12:
+ ADD sp, sp, #460 // skip over r2.
+ POP {r4, r5, r6, r7, r8, r9, sl, fp, lr}
+ VPOP {d8-d15}
+ BX lr
+
+END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..0969ac9
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch32-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,747 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/4x8-aarch32-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+.syntax unified
+
+// void xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64(
+// size_t mr, r0
+// size_t nc, r1
+// size_t kc, r2 -> r5
+// const uint8_t*restrict a, r3
+// size_t a_stride, sp + 96 -> (r7)
+// const void*restrict w, sp + 100 -> r9
+// uint8_t*restrict c, sp + 104 -> r11
+// size_t cm_stride, sp + 108 -> (r6)
+// size_t cn_stride, sp + 112 -> r7
+// const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) sp + 116 -> (r7)
+
+
+// inner loop registers
+
+// A0 r7 d0
+// A1 r6 d1
+// A2 r2 d2
+// A3 r3 d3
+
+// B r9 d8, d9, d10, d11
+// B d12, d13, d14, d15
+
+// C3 ip r12 [240]q9 q5
+// C2 sl r10 q15 q7
+// C1 fp r11 q2 q4
+// C0 r0 q6 q14
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64
+ .arm
+#ifndef __APPLE__
+ .arch armv7-a
+ .fpu neon
+#endif
+ # Push 104 bytes. r2 is for kc reset
+ VPUSH {d8-d15} // 64 bytes
+ PUSH {r2, r4, r5, r6, r7, r8, r9, sl, fp, lr} // 40 bytes
+ SUB sp, sp, #456 // +456 = 560 bytes. TODO eliminate
+ MOV lr, r1
+ LDR r1, [sp, #560]
+ CMP r0, #2
+ MOV r4, r3
+ ADDCS r4, r4, r1
+ CMP r0, #3
+ LDR r9, [sp, #580]
+ MOV r8, #15
+ MOV r6, r4
+ LDR ip, [sp, #568]
+ ADDCS r6, r6, r1
+ CMP r0, #4
+ LDR r5, [sp, #572]
+ MOV r7, r6
+ MOV sl, ip
+ ADDEQ r7, r7, r1
+ MOV r1, r9
+ VLD1.32 {d16-d17}, [r1], r8
+ CMP r0, #2
+ ADDCS sl, sl, r5
+ CMP r0, #3
+ VLD1.8 {d18-d19}, [r1]
+ ADD r1, r9, #4
+ MOV fp, sl
+ VLD1.32 {d20-d21}, [r1]
+ ADD r1, r9, #8
+ ADDCS fp, fp, r5
+ CMP r0, #4
+ VLD1.64 {d22-d23}, [r1]
+ ADD r1, r9, #12
+ MOV r0, fp
+ VLD1.32 {d24-d25}, [r1]
+ ADD r1, r9, #14
+ ADDEQ r0, r0, r5
+ MOV r9, #32
+ VLD1.16 {d26-d27}, [r1]
+ MOV r1, r2
+ MOV r2, r4
+ ADD r4, sp, #128
+ VDUP.32 q0, d16[0]
+ LDR r5, [sp, #576]
+ LDR r8, [sp, #564] // w
+ VDUP.8 q8, d26[0]
+ STR r1, [sp, #56]
+ VDUP.8 q1, d18[0]
+ VSTMIA r4, {d16-d17}
+ ADD r4, sp, #112
+ VDUP.16 q8, d24[0]
+ VSTMIA r4, {d16-d17}
+ ADD r4, sp, #96
+ VDUP.32 q8, d22[0]
+ VSTMIA r4, {d16-d17}
+ ADD r4, sp, #80
+ VDUP.32 q8, d20[0]
+ VSTMIA r4, {d16-d17}
+ ADD r4, sp, #32
+ VSTMIA r4, {d0-d1}
+ ADD r4, sp, #16
+ VSTMIA r4, {d2-d3}
+0:
+ # Load initial bias from w into accumulators
+ ADD r4, r8, #16
+ VLD1.8 {d16-d17}, [r8], r9 // Bias
+ CMP r1, #8
+ STR lr, [sp, #12]
+ ADD lr, sp, #240
+ VLD1.8 {d10-d11}, [r4]
+ VSTMIA lr, {d16-d17}
+ LDR lr, [sp, #12]
+ BCC 2f // less than 8 channels? skip main loop
+ STR lr, [sp, #64]
+ ADD lr, sp, #240
+ VORR q7, q5, q5
+ STR ip, [sp, #68]
+ VLDMIA lr, {d6-d7}
+ VORR q14, q5, q5
+ VORR q6, q5, q5
+ MOV ip, #0
+ VORR q15, q3, q3
+ MOV r4, r1
+ VORR q4, q3, q3
+ STR r0, [sp, #72]
+ STR fp, [sp, #76]
+ STR sl, [sp, #60]
+ STR r7, [sp, #156]
+
+ # Main loop - 8 bytes of A
+
+1:
+ MOV r9, r4
+ MOV r4, r7
+ LDR lr, [r8, #4]
+ MOV r7, r6
+ MOV r6, r2
+ LDR sl, [r4, ip]!
+ LDR fp, [r8]
+ LDR r2, [r8, #8]
+ LDR r1, [r8, #12]
+ STR lr, [sp, #364]
+ ADD lr, sp, #192
+ LDR r0, [r8, #24]
+ STR fp, [sp, #360]
+ LDR r5, [r8, #20]
+ STR r2, [sp, #384]
+ ADD r2, sp, #360
+ STR r1, [sp, #388]
+ VLD1.8 {d16}, [r2 :64]
+ ADD r2, sp, #384
+ VLD1.8 {d17}, [r2 :64]
+ VMOVL.S8 q10, d16
+ LDR r1, [r8, #28]
+ LDR r2, [r8, #16]
+ STR sl, [sp, #416]
+ STR r1, [sp, #380]
+ STR r0, [sp, #376]
+ STR r5, [sp, #372]
+ STR r2, [sp, #368]
+ LDR r0, [r4, #4]
+ MOV r4, r9
+ STR r0, [sp, #420]
+ ADD r0, sp, #416
+ SUB r4, r9, #8
+ VLD1.8 {d18}, [r0 :64]
+ ADD r0, sp, #376
+ CMP r4, #7
+ VMOVL.S8 q0, d18
+ VLD1.8 {d16}, [r0 :64]
+ VMOVL.S8 q9, d17
+ ADD r0, sp, #368
+ VMLAL.S16 q6, d21, d0[0]
+ VLD1.8 {d17}, [r0 :64]
+ VORR q11, q9, q9
+ PLD [r8, #480]
+ LDR r0, [r8, #32]
+ VMLAL.S16 q3, d20, d0[0]
+ LDR r1, [r8, #36]
+ VMLAL.S16 q6, d19, d0[1]
+ LDR r2, [r8, #40]
+ VMOVL.S8 q9, d17
+ LDR r5, [r8, #44]
+ STR r0, [sp, #424]
+ ADD r0, sp, #424
+ STR r1, [sp, #428]
+ VMLAL.S16 q6, d19, d0[2]
+ VORR q1, q9, q9
+ VMOVL.S8 q9, d16
+ VMLAL.S16 q6, d19, d0[3]
+ VSTMIA lr, {d18-d19}
+ ADD lr, sp, #176
+ VORR q9, q11, q11
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #432
+ VMLAL.S16 q3, d18, d0[1]
+ VMOVL.S8 q8, d16
+ STR r2, [sp, #432]
+ STR r5, [sp, #436]
+ MOV r2, r6
+ MOV r6, r7
+ LDR r7, [sp, #156]
+ VMLAL.S16 q6, d17, d1[0]
+ LDR r1, [r8, #48]
+ VSTMIA lr, {d16-d17}
+ ADD lr, sp, #208
+ VLD1.8 {d16}, [r0 :64]
+ LDR r0, [r8, #52]
+ VMOVL.S8 q8, d16
+ STR r0, [sp, #444]
+ ADD r0, sp, #440
+ STR r1, [sp, #440]
+ VLD1.8 {d24}, [r0 :64]
+ VMLAL.S16 q6, d17, d1[1]
+ VORR q2, q8, q8
+ LDR r0, [r8, #60]
+ VMOVL.S8 q8, d24
+ LDR r1, [r8, #56]
+ ADD r8, r8, #64
+ STR r0, [sp, #452]
+ ADD r0, sp, #448
+ STR r1, [sp, #448]
+ VMLAL.S16 q6, d17, d1[2]
+ VLD1.8 {d26}, [r0 :64]
+ VORR q12, q8, q8
+ MOV r0, r6
+ VMOVL.S8 q13, d26
+ LDR r1, [r0, ip]!
+ VMLAL.S16 q6, d27, d1[3]
+ VSTMIA lr, {d12-d13}
+ VORR q6, q10, q10
+ VORR q10, q1, q1
+ ADD lr, sp, #256
+ VORR q1, q12, q12
+ VSTMIA lr, {d22-d23}
+ ADD lr, sp, #192
+ VMLAL.S16 q3, d20, d0[2]
+ VLDMIA lr, {d22-d23}
+ ADD lr, sp, #176
+ VLDMIA lr, {d16-d17}
+ ADD lr, sp, #160
+ VMLAL.S16 q3, d22, d0[3]
+ STR r1, [sp, #408]
+ LDR r0, [r0, #4]
+ STR r0, [sp, #412]
+ ADD r0, sp, #408
+ VMLAL.S16 q3, d16, d1[0]
+ VMLAL.S16 q3, d4, d1[1]
+ VMLAL.S16 q3, d24, d1[2]
+ VORR q12, q8, q8
+ VMLAL.S16 q3, d26, d1[3]
+ VLD1.8 {d0}, [r0 :64]
+ MOV r0, r2
+ VMOVL.S8 q0, d0
+ VSTMIA lr, {d4-d5}
+ ADD lr, sp, #256
+ LDR r1, [r0, ip]!
+ VMLAL.S16 q14, d13, d0[0]
+ VMLAL.S16 q4, d12, d0[0]
+ VMLAL.S16 q14, d19, d0[1]
+ VORR q9, q10, q10
+ VMLAL.S16 q14, d21, d0[2]
+ VORR q10, q6, q6
+ VMLAL.S16 q14, d23, d0[3]
+ VMLAL.S16 q14, d17, d1[0]
+ VORR q8, q2, q2
+ VLDMIA lr, {d4-d5}
+ ADD lr, sp, #160
+ VMLAL.S16 q4, d4, d0[1]
+ STR r1, [sp, #400]
+ VMLAL.S16 q14, d17, d1[1]
+ LDR r0, [r0, #4]
+ STR r0, [sp, #404]
+ ADD r0, sp, #400
+ VMLAL.S16 q4, d18, d0[2]
+ VMLAL.S16 q14, d3, d1[2]
+ VMLAL.S16 q4, d22, d0[3]
+ VMLAL.S16 q14, d27, d1[3]
+ VMLAL.S16 q4, d24, d1[0]
+ VMLAL.S16 q4, d16, d1[1]
+ VLDMIA lr, {d16-d17}
+ ADD lr, sp, #224
+ VMLAL.S16 q4, d2, d1[2]
+ VMLAL.S16 q4, d26, d1[3]
+ VLD1.8 {d0}, [r0 :64]
+ MOV r0, r3
+ VMOVL.S8 q0, d0
+ LDR r1, [r0, ip]!
+ ADD ip, ip, #8
+ VMLAL.S16 q7, d13, d0[0]
+ VMLAL.S16 q15, d20, d0[0]
+ VMLAL.S16 q7, d5, d0[1]
+ VORR q2, q9, q9
+ VMLAL.S16 q7, d19, d0[2]
+ VORR q9, q1, q1
+ VMLAL.S16 q7, d23, d0[3]
+ VMLAL.S16 q7, d25, d1[0]
+ VMLAL.S16 q7, d17, d1[1]
+ VMLAL.S16 q7, d3, d1[2]
+ VMLAL.S16 q7, d27, d1[3]
+ VSTMIA lr, {d14-d15}
+ ADD lr, sp, #240
+ STR r1, [sp, #392]
+ LDR r0, [r0, #4]
+ STR r0, [sp, #396]
+ ADD r0, sp, #392
+ VLDMIA lr, {d12-d13}
+ ADD lr, sp, #256
+ VLD1.8 {d2}, [r0 :64]
+ VMOVL.S8 q1, d2
+ VLDMIA lr, {d14-d15}
+ ADD lr, sp, #224
+ VMLAL.S16 q15, d14, d0[1]
+ VMLAL.S16 q6, d20, d2[0]
+ VMLAL.S16 q5, d21, d2[0]
+ VMLAL.S16 q15, d4, d0[2]
+ VMLAL.S16 q6, d14, d2[1]
+ VMLAL.S16 q5, d15, d2[1]
+ VLDMIA lr, {d14-d15}
+ ADD lr, sp, #240
+ VMLAL.S16 q15, d22, d0[3]
+ VMLAL.S16 q6, d4, d2[2]
+ VMLAL.S16 q5, d5, d2[2]
+ VMLAL.S16 q15, d24, d1[0]
+ VMLAL.S16 q6, d22, d2[3]
+ VMLAL.S16 q5, d23, d2[3]
+ VMLAL.S16 q15, d16, d1[1]
+ VMLAL.S16 q6, d24, d3[0]
+ VMLAL.S16 q5, d25, d3[0]
+ VMLAL.S16 q15, d18, d1[2]
+ VMLAL.S16 q6, d16, d3[1]
+ VMLAL.S16 q5, d17, d3[1]
+ VMLAL.S16 q15, d26, d1[3]
+ VMLAL.S16 q6, d18, d3[2]
+ VMLAL.S16 q5, d19, d3[2]
+ VMLAL.S16 q6, d26, d3[3]
+ VMLAL.S16 q5, d27, d3[3]
+ VSTMIA lr, {d12-d13}
+ ADD lr, sp, #208
+ VLDMIA lr, {d12-d13}
+ BHI 1b
+ ADD r5, sp, #32
+ ADD lr, sp, #56
+ VORR q2, q4, q4
+ ADD r7, r7, ip
+ VLDMIA r5, {d0-d1}
+ ADD r5, sp, #16
+ VORR q4, q14, q14
+ ADD r6, r6, ip
+ VLDMIA r5, {d2-d3}
+ ADD r2, r2, ip
+ ADD r3, r3, ip
+ VORR q14, q3, q3
+ LDR ip, [sp, #68]
+ MOV r9, #32
+ LDR fp, [sp, #76]
+ LDR r0, [sp, #72]
+ LDR r5, [sp, #576]
+ LDM lr, {r1, sl, lr}
+ B 3f
+2:
+ STR lr, [sp, #12]
+ ADD lr, sp, #240
+ VORR q15, q5, q5
+ MOV r4, r1
+ VLDMIA lr, {d16-d17}
+ VORR q6, q5, q5
+ VORR q4, q5, q5
+ LDR lr, [sp, #12]
+ VORR q14, q8, q8
+ VORR q2, q8, q8
+ VORR q7, q5, q5
+ VORR q15, q8, q8
+3:
+ CMP r4, #0
+ BNE 5f
+
+ # rndnu quantization
+ # C3 [240]q9 q5
+ # C2 q15 q7
+ # C1 q2 q4
+ # C0 q6 q14
+
+4:
+ ADD r4, sp, #80
+ VSHL.S32 q11, q2, q0
+ CMP lr, #7
+ VLDMIA r4, {d4-d5}
+
+ VSHL.S32 q13, q15, q0
+ VLDR q15, [sp, 240] // q15 spilled
+
+ VSHL.S32 q8, q6, q0
+ VSHL.S32 q9, q14, q0
+ VSHL.S32 q10, q4, q0
+ VSHL.S32 q12, q7, q0
+ VSHL.S32 q14, q5, q0
+ VSHL.S32 q15, q15, q0
+
+ VQDMULH.S32 q8, q8, q2
+ VQDMULH.S32 q9, q9, q2
+ VQDMULH.S32 q10, q10, q2
+ VQDMULH.S32 q12, q12, q2
+ VQDMULH.S32 q11, q11, q2
+ VQDMULH.S32 q13, q13, q2
+ VQDMULH.S32 q14, q14, q2
+ VQDMULH.S32 q15, q15, q2
+ VLDMIA r4, {d4-d5}
+ VRSHL.S32 q8, q8, q2
+ VRSHL.S32 q9, q9, q2
+ VRSHL.S32 q10, q10, q2
+ VRSHL.S32 q12, q12, q2
+ VRSHL.S32 q11, q11, q2
+ VRSHL.S32 q13, q13, q2
+ VRSHL.S32 q14, q14, q2
+ VRSHL.S32 q15, q15, q2
+ VQMOVN.S32 d17, q8
+ VQMOVN.S32 d16, q9
+ VQMOVN.S32 d19, q10
+ VQMOVN.S32 d21, q12
+ VLDMIA r4, {d24-d25}
+ VQMOVN.S32 d18, q11
+ VQMOVN.S32 d20, q13
+ VQMOVN.S32 d23, q14
+ VQMOVN.S32 d22, q15
+ VQADD.S16 q8, q8, q12
+ VQADD.S16 q9, q9, q12
+ VQADD.S16 q10, q10, q12
+ VQADD.S16 q11, q11, q12
+ VQMOVN.S16 d17, q8
+ VQMOVN.S16 d16, q9
+ VQMOVN.S16 d19, q10
+ VQMOVN.S16 d18, q11
+ VLDMIA r4, {d20-d21}
+ VMAX.S8 q8, q8, q10
+ VMAX.S8 q10, q9, q10
+ SUBS lr, lr, #8
+ VMIN.S8 q9, q8, q1
+ VMIN.S8 q11, q10, q1
+ BLS 9f
+
+ # Store full 4 x 8
+ VST1.8 {d22}, [ip], r5
+ SUB r7, r7, r1
+ VST1.8 {d23}, [sl], r5
+ SUB r6, r6, r1
+ VST1.8 {d18}, [fp], r5
+ SUB r2, r2, r1
+ VST1.8 {d19}, [r0], r5
+ SUB r3, r3, r1
+ BNE 0b
+
+ ADD sp, sp, #460 // skip over r2.
+ POP {r4, r5, r6, r7, r8, r9, sl, fp, lr}
+ VPOP {d8-d15}
+ BX lr
+
+5:
+ STR r0, [sp, #72]
+ ADD r0, r8, #8
+ STR r0, [sp, #224]
+ MOV r5, r7
+ MOV r7, r6
+ LDR r0, [r8]
+ STR r0, [sp, #256]
+ MOV r6, r2
+ LDR r2, [r7]
+ MOV r9, r3
+ LDR r1, [r7, #4]
+ CMP r4, #1
+ LDR r0, [r5, #4]
+ STR fp, [sp, #76]
+ LDR fp, [r8, #4]
+ LDR r3, [r5]
+ STR r1, [sp, #340]
+ STR r2, [sp, #336]
+ STR r0, [sp, #348]
+ LDR r0, [r6]
+ LDR r2, [r6, #4]
+ STR r3, [sp, #344]
+ MOV r3, r9
+ STR fp, [sp, #356]
+ ADD r3, r4, r9
+ LDR r1, [sp, #256]
+ STR r1, [sp, #352]
+ STR r0, [sp, #328]
+ ADD r0, sp, #336
+ STR r2, [sp, #332]
+ MOV r2, r6
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #344
+ MOV r6, r7
+ MOV r7, r5
+ VLD1.8 {d17}, [r0 :64]
+ ADD r0, sp, #352
+ VMOVL.S8 q10, d16
+ ADD r7, r4, r5
+ VLD1.8 {d18}, [r0 :64]
+ ADD r0, sp, #328
+ VMOVL.S8 q3, d17
+ ADD r6, r4, r6
+ VLD1.8 {d17}, [r0 :64]
+ VMOVL.S8 q9, d18
+ VORR q11, q10, q10
+ ADD r2, r4, r2
+ VMOVL.S8 q8, d17
+ LDR r0, [r9]
+ LDR r1, [r9, #4]
+ VMLAL.S16 q6, d19, d6[0]
+ STR r0, [sp, #320]
+ VORR q12, q3, q3
+ VMLAL.S16 q14, d18, d6[0]
+ ADD r0, sp, #320
+ VORR q3, q10, q10
+ STR r1, [sp, #324]
+ VORR q10, q8, q8
+ VMLAL.S16 q4, d19, d6[0]
+ VMLAL.S16 q2, d18, d6[0]
+ VORR q3, q8, q8
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #240
+ VMLAL.S16 q7, d19, d6[0]
+ VMLAL.S16 q15, d18, d6[0]
+ VMOVL.S8 q3, d16
+ VLDMIA r0, {d16-d17}
+ ADD r0, sp, #240
+ VMLAL.S16 q8, d18, d6[0]
+ VMLAL.S16 q5, d19, d6[0]
+ VSTMIA r0, {d16-d17}
+ BNE 6f
+ LDR r8, [sp, #224]
+ MOV r9, #32
+ LDR fp, [sp, #76]
+ B 8f
+6:
+ LDR r5, [sp, #224]
+ VORR q13, q3, q3
+ VORR q3, q12, q12
+ CMP r4, #3
+ MOV r9, #32
+ LDR r0, [r5]
+ LDR r1, [r5, #4]
+ STR r0, [sp, #312]
+ ADD r0, sp, #312
+ STR r1, [sp, #316]
+ ADD r1, r5, #8
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #240
+ VMOVL.S8 q8, d16
+ VLDMIA r0, {d18-d19}
+ ADD r0, sp, #240
+ LDR fp, [sp, #76]
+ VMLAL.S16 q6, d17, d6[1]
+ VMLAL.S16 q14, d16, d6[1]
+ VORR q3, q11, q11
+ VMLAL.S16 q4, d17, d6[1]
+ VMLAL.S16 q2, d16, d6[1]
+ VORR q3, q10, q10
+ VMLAL.S16 q7, d17, d6[1]
+ VMLAL.S16 q15, d16, d6[1]
+ VORR q3, q13, q13
+ VMLAL.S16 q9, d16, d6[1]
+ VMLAL.S16 q5, d17, d6[1]
+ VSTMIA r0, {d18-d19}
+ BCC 7f
+ LDR r0, [r1]
+ VORR q3, q12, q12
+ LDR r1, [r1, #4]
+ CMP r4, #3
+ STR r0, [sp, #304]
+ ADD r0, sp, #304
+ STR r1, [sp, #308]
+ ADD r1, r5, #16
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #240
+ VMOVL.S8 q8, d16
+ VLDMIA r0, {d18-d19}
+ ADD r0, sp, #240
+ VMLAL.S16 q6, d17, d6[2]
+ VMLAL.S16 q14, d16, d6[2]
+ VORR q3, q11, q11
+ VMLAL.S16 q4, d17, d6[2]
+ VMLAL.S16 q2, d16, d6[2]
+ VORR q3, q10, q10
+ VMLAL.S16 q7, d17, d6[2]
+ VMLAL.S16 q15, d16, d6[2]
+ VORR q3, q13, q13
+ VMLAL.S16 q9, d16, d6[2]
+ VMLAL.S16 q5, d17, d6[2]
+ VSTMIA r0, {d18-d19}
+ BEQ 7f
+ LDR r0, [r1]
+ VORR q3, q12, q12
+ LDR r1, [r1, #4]
+ CMP r4, #5
+ STR r0, [sp, #296]
+ ADD r0, sp, #296
+ STR r1, [sp, #300]
+ ADD r1, r5, #24
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #240
+ VMOVL.S8 q8, d16
+ VLDMIA r0, {d18-d19}
+ ADD r0, sp, #240
+ VMLAL.S16 q6, d17, d6[3]
+ VMLAL.S16 q14, d16, d6[3]
+ VORR q3, q11, q11
+ VMLAL.S16 q4, d17, d6[3]
+ VMLAL.S16 q2, d16, d6[3]
+ VORR q3, q10, q10
+ VMLAL.S16 q7, d17, d6[3]
+ VMLAL.S16 q15, d16, d6[3]
+ VORR q3, q13, q13
+ VMLAL.S16 q9, d16, d6[3]
+ VMLAL.S16 q5, d17, d6[3]
+ VSTMIA r0, {d18-d19}
+ BCC 7f
+ LDR r0, [r1]
+ VORR q3, q12, q12
+ LDR r1, [r1, #4]
+ CMP r4, #5
+ STR r0, [sp, #288]
+ ADD r0, sp, #288
+ STR r1, [sp, #292]
+ ADD r1, r5, #32
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #240
+ VMOVL.S8 q8, d16
+ VLDMIA r0, {d18-d19}
+ ADD r0, sp, #240
+ VMLAL.S16 q6, d17, d7[0]
+ VMLAL.S16 q14, d16, d7[0]
+ VORR q3, q11, q11
+ VMLAL.S16 q4, d17, d7[0]
+ VMLAL.S16 q2, d16, d7[0]
+ VORR q3, q10, q10
+ VMLAL.S16 q7, d17, d7[0]
+ VMLAL.S16 q15, d16, d7[0]
+ VORR q3, q13, q13
+ VMLAL.S16 q9, d16, d7[0]
+ VMLAL.S16 q5, d17, d7[0]
+ VSTMIA r0, {d18-d19}
+ BEQ 7f
+ LDR r0, [r1]
+ VORR q3, q12, q12
+ LDR r1, [r1, #4]
+ CMP r4, #7
+ STR r0, [sp, #280]
+ ADD r0, sp, #280
+ STR r1, [sp, #284]
+ ADD r1, r5, #40
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #240
+ VMOVL.S8 q8, d16
+ VLDMIA r0, {d18-d19}
+ ADD r0, sp, #240
+ VMLAL.S16 q6, d17, d7[1]
+ VMLAL.S16 q14, d16, d7[1]
+ VORR q3, q11, q11
+ VMLAL.S16 q4, d17, d7[1]
+ VMLAL.S16 q2, d16, d7[1]
+ VORR q3, q10, q10
+ VMLAL.S16 q7, d17, d7[1]
+ VMLAL.S16 q15, d16, d7[1]
+ VORR q3, q13, q13
+ VMLAL.S16 q9, d16, d7[1]
+ VMLAL.S16 q5, d17, d7[1]
+ VSTMIA r0, {d18-d19}
+ BCC 7f
+ LDR r0, [r1]
+ VORR q3, q12, q12
+ LDR r1, [r1, #4]
+ ADD r8, r8, #56
+ STR r0, [sp, #272]
+ ADD r0, sp, #272
+ STR r1, [sp, #276]
+ VLD1.8 {d16}, [r0 :64]
+ ADD r0, sp, #240
+ VMOVL.S8 q8, d16
+ VLDMIA r0, {d18-d19}
+ ADD r0, sp, #240
+ VMLAL.S16 q6, d17, d7[2]
+ VMLAL.S16 q14, d16, d7[2]
+ VORR q3, q11, q11
+ VMLAL.S16 q4, d17, d7[2]
+ VMLAL.S16 q2, d16, d7[2]
+ VORR q3, q10, q10
+ VMLAL.S16 q7, d17, d7[2]
+ VMLAL.S16 q15, d16, d7[2]
+ VORR q3, q13, q13
+ VMLAL.S16 q9, d16, d7[2]
+ VMLAL.S16 q5, d17, d7[2]
+ VSTMIA r0, {d18-d19}
+ B 8f
+7:
+ MOV r8, r1
+8:
+ LDR r0, [sp, #72]
+ LDR r1, [sp, #56]
+ LDR r5, [sp, #576]
+ B 4b
+
+ # Store odd width
+9:
+ TST lr, #4
+ BEQ 10f
+ VST1.32 {d22[0]}, [ip]!
+ VST1.32 {d23[0]}, [sl]!
+ VST1.32 {d18[0]}, [fp]!
+ VST1.32 {d19[0]}, [r0]!
+ VEXT.8 q9, q9, q9, #4
+ VEXT.8 q11, q11, q11, #4
+10:
+ TST lr, #2
+ BEQ 11f
+ VST1.16 {d22[0]}, [ip]!
+ VST1.16 {d23[0]}, [sl]!
+ VST1.16 {d18[0]}, [fp]!
+ VST1.16 {d19[0]}, [r0]!
+ VEXT.8 q9, q9, q9, #2
+ VEXT.8 q11, q11, q11, #2
+11:
+ TST lr, #1
+ BEQ 12f
+ VST1.8 {d22[0]}, [ip]
+ VST1.8 {d23[0]}, [sl]
+ VST1.8 {d18[0]}, [fp]
+ VST1.8 {d19[0]}, [r0]
+12:
+ ADD sp, sp, #460 // skip over r2.
+ POP {r4, r5, r6, r7, r8, r9, sl, fp, lr}
+ VPOP {d8-d15}
+ BX lr
+
+END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..1a4555e
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,391 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const int8_t* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+
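+# The requantization code below assumes the rndnu_neon layout of
+# xnn_qs8_conv_minmax_params (a sketch; the field names come from that
+# assumption, not from this file):
+#   int32_t right_pre_shift;     loaded with LD1R {v4.4s}
+#   int32_t multiplier;          loaded with LD1R {v5.4s}
+#   int32_t right_post_shift;    loaded with LD1R {v6.4s}
+#   int16_t output_zero_point;   loaded with LD1R {v6.8h}
+#   int8_t  output_min;          loaded with LD1R {v4.8b}
+#   int8_t  output_max;          loaded with LD1R {v5.8b}
+# 15 bytes are consumed per pass, then the params pointer is rewound.
+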
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3 x4 v3
+# B x5 v5
+# C0 x6 v24 v28
+# C1 x8 v25 v29
+# C2 x9 v26 v30
+# C3 x7 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64
+
+ # Clamp A and C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x12, x11, [sp] // Load cn_stride, params
+ ADD x15, x3, x4 // a1 = a0 + a_stride
+ ADD x8, x6, x7 // c1 = c0 + cm_stride
+ CSEL x15, x3, x15, LO // a1 = a0
+ CSEL x8, x6, x8, LO // c1 = c0
+
+ ADD x13, x15, x4 // a2 = a1 + a_stride
+ ADD x9, x8, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x13, x15, x13, LS // a2 = a1
+ CSEL x9, x8, x9, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ ADD x4, x13, x4 // a3 = a2 + a_stride
+ ADD x7, x9, x7 // c3 = c2 + cm_stride
+ CSEL x4, x13, x4, LO // a3 = a2
+ CSEL x7, x9, x7, LO // c3 = c2
+
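+        # The CSELs above clamp rows beyond mr; roughly (illustrative C, not part of the source):
+        #   a1 = mr < 2  ? a0 : a0 + a_stride;  c1 = mr < 2  ? c0 : c0 + cm_stride;
+        #   a2 = mr <= 2 ? a1 : a1 + a_stride;  c2 = mr <= 2 ? c1 : c1 + cm_stride;
+        #   a3 = mr < 4  ? a2 : a2 + a_stride;  c3 = mr < 4  ? c2 : c2 + cm_stride;
+        # Extra rows alias the previous row, so their stores repeat the same data harmlessly.
+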
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q24, q28, [x5], 32
+ SUBS x0, x2, 8 // k = kc - 8
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ # Is there at least 8 bytes for main loop?
+ B.LO 3f
+
+ # Main loop - 8 bytes of A
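+        # Each iteration loads 8 bytes per A row and 8 rows of packed B
+        # (eight LDR d5 loads), widens both to 16 bits with SXTL, and
+        # accumulates 32-bit lane products with SMLAL/SMLAL2.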
+ .p2align 3
+1:
+ LD1 {v0.8b}, [x3], 8
+ LDR d5, [x5], 8
+ LD1 {v1.8b}, [x15], 8
+ LD1 {v2.8b}, [x13], 8
+ LD1 {v3.8b}, [x4], 8
+ SXTL v0.8h, v0.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 1b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+ # Is there a remainder?- 1 to 7 bytes of A
+ CBNZ x0, 3f
+
+2:
+ # Apply params - preshift, scale, postshift, bias and clamp
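+        # Per 32-bit accumulator this is roughly (scalar sketch, illustrative names):
+        #   acc = sat_shl(acc, right_pre_shift)        // SQSHL
+        #   acc = sat_doubling_mulh(acc, multiplier)   // SQDMULH
+        #   acc = rounding_shl(acc, right_post_shift)  // SRSHL, negative shift = right shift
+        #   out = clamp(sat_narrow(acc) + output_zero_point, output_min, output_max)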
+ LD1R {v4.4s}, [x11], 4
+ SQSHL v24.4s, v24.4s, v4.4s // shift to upper bits
+ SQSHL v25.4s, v25.4s, v4.4s
+ SQSHL v26.4s, v26.4s, v4.4s
+ SQSHL v27.4s, v27.4s, v4.4s
+ LD1R {v5.4s}, [x11], 4
+ SQSHL v28.4s, v28.4s, v4.4s
+ SQSHL v29.4s, v29.4s, v4.4s
+ SQSHL v30.4s, v30.4s, v4.4s
+ SQSHL v31.4s, v31.4s, v4.4s
+ LD1R {v6.4s}, [x11], 4
+ SQDMULH v24.4s, v24.4s, v5.4s // scale without rounding
+ SQDMULH v25.4s, v25.4s, v5.4s
+ SQDMULH v26.4s, v26.4s, v5.4s
+ SQDMULH v27.4s, v27.4s, v5.4s
+ SQDMULH v28.4s, v28.4s, v5.4s
+ SQDMULH v29.4s, v29.4s, v5.4s
+ SQDMULH v30.4s, v30.4s, v5.4s
+ SQDMULH v31.4s, v31.4s, v5.4s
+ SRSHL v24.4s, v24.4s, v6.4s // signed rounding shift left
+ SRSHL v25.4s, v25.4s, v6.4s
+ SRSHL v26.4s, v26.4s, v6.4s
+ SRSHL v27.4s, v27.4s, v6.4s
+ SRSHL v28.4s, v28.4s, v6.4s
+ SRSHL v29.4s, v29.4s, v6.4s
+ SRSHL v30.4s, v30.4s, v6.4s
+ SRSHL v31.4s, v31.4s, v6.4s
+
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+ LD1R {v4.8b}, [x11], 1 // clamp min value
+
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v5.8b}, [x11] // clamp max value
+ SQXTN v0.8b, v24.8h
+ SQXTN v1.8b, v25.8h
+ SQXTN v2.8b, v26.8h
+ SQXTN v3.8b, v27.8h
+ SUB x11, x11, 15 // rewind params pointer
+
+ SMAX v0.8b, v0.8b, v4.8b
+ SMAX v1.8b, v1.8b, v4.8b
+ SMAX v2.8b, v2.8b, v4.8b
+ SMAX v3.8b, v3.8b, v4.8b
+ SUBS x1, x1, 8
+ SMIN v0.8b, v0.8b, v5.8b
+ SMIN v1.8b, v1.8b, v5.8b
+ SMIN v2.8b, v2.8b, v5.8b
+ SMIN v3.8b, v3.8b, v5.8b
+ B.LO 4f
+
+ # Store full 4 x 8
+ ST1 {v0.8b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v1.8b}, [x8], x12
+ SUB x15, x15, x2 // a1 -= kc
+ ST1 {v2.8b}, [x9], x12
+ SUB x13, x13, x2 // a2 -= kc
+ ST1 {v3.8b}, [x7], x12
+ SUB x4, x4, x2 // a3 -= kc
+ B.NE 0b
+ RET
+
+ # Remainder- 1 to 7 bytes of A
+ .p2align 3
+3:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x3], x0
+ LDR d5, [x5], 8
+ LD1 {v1.8b}, [x15], x0
+ LD1 {v2.8b}, [x13], x0
+ LD1 {v3.8b}, [x4], x0
+ SXTL v0.8h, v0.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 2b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 2b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 2b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 2b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 2b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 2b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 2b
+
+ # Store odd width
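+        # nc remainder is 1 to 7: bit 2 stores 4 bytes, bit 1 stores 2, bit 0
+        # stores 1, with DUP shifting the next lanes into position between steps.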
+ .p2align 3
+4:
+ TBZ x1, 2, 5f
+ STR s0, [x6], 4
+ STR s1, [x8], 4
+ DUP s0, v0.s[1]
+ DUP s1, v1.s[1]
+ STR s2, [x9], 4
+ STR s3, [x7], 4
+ DUP s2, v2.s[1]
+ DUP s3, v3.s[1]
+5:
+ TBZ x1, 1, 6f
+ STR h0, [x6], 2
+ STR h1, [x8], 2
+ DUP h0, v0.h[1]
+ DUP h1, v1.h[1]
+ STR h2, [x9], 2
+ STR h3, [x7], 2
+ DUP h2, v2.h[1]
+ DUP h3, v3.h[1]
+6:
+ TBZ x1, 0, 7f
+ STR b0, [x6]
+ STR b1, [x8]
+ STR b2, [x9]
+ STR b3, [x7]
+7:
+ RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..c1bc2f8
--- /dev/null
+++ b/src/qs8-gemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,397 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const int8_t* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3 x4 v3
+# B x5 v5
+# C0 x6 v24 v28
+# C1 x8 v25 v29
+# C2 x9 v26 v30
+# C3 x7 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64
+
+ # Clamp A and C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x12, x11, [sp] // Load cn_stride, params
+ ADD x15, x3, x4 // a1 = a0 + a_stride
+ ADD x8, x6, x7 // c1 = c0 + cm_stride
+ CSEL x15, x3, x15, LO // a1 = a0
+ CSEL x8, x6, x8, LO // c1 = c0
+
+ ADD x13, x15, x4 // a2 = a1 + a_stride
+ ADD x9, x8, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x13, x15, x13, LS // a2 = a1
+ CSEL x9, x8, x9, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ ADD x4, x13, x4 // a3 = a2 + a_stride
+ ADD x7, x9, x7 // c3 = c2 + cm_stride
+ CSEL x4, x13, x4, LO // a3 = a2
+ CSEL x7, x9, x7, LO // c3 = c2
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q24, q28, [x5], 32
+ SUBS x0, x2, 8 // k = kc - 8
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ # Is there at least 8 bytes for main loop?
+ B.LO 3f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+1:
+ LD1 {v0.8b}, [x3], 8
+ LDR d5, [x5], 8
+ LD1 {v1.8b}, [x15], 8
+ LD1 {v2.8b}, [x13], 8
+ LD1 {v3.8b}, [x4], 8
+ SXTL v0.8h, v0.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
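+        # Prefetch roughly 128 bytes ahead on each A row and 448-512 bytes
+        # ahead on the packed weights; these distances are specific to the prfm variant.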
+ PRFM PLDL1KEEP, [x13, 128]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ PRFM PLDL1KEEP, [x15, 128]
+ PRFM PLDL1KEEP, [x3, 128]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ PRFM PLDL1KEEP, [x4, 128]
+ PRFM PLDL1KEEP, [x5, 448]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ PRFM PLDL1KEEP, [x5, 512]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 1b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+ # Is there a remainder?- 1 to 7 bytes of A
+ CBNZ x0, 3f
+
+2:
+ # Apply params - preshift, scale, postshift, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SQSHL v24.4s, v24.4s, v4.4s // shift to upper bits
+ SQSHL v25.4s, v25.4s, v4.4s
+ SQSHL v26.4s, v26.4s, v4.4s
+ SQSHL v27.4s, v27.4s, v4.4s
+ LD1R {v5.4s}, [x11], 4
+ SQSHL v28.4s, v28.4s, v4.4s
+ SQSHL v29.4s, v29.4s, v4.4s
+ SQSHL v30.4s, v30.4s, v4.4s
+ SQSHL v31.4s, v31.4s, v4.4s
+ LD1R {v6.4s}, [x11], 4
+ SQDMULH v24.4s, v24.4s, v5.4s // scale without rounding
+ SQDMULH v25.4s, v25.4s, v5.4s
+ SQDMULH v26.4s, v26.4s, v5.4s
+ SQDMULH v27.4s, v27.4s, v5.4s
+ SQDMULH v28.4s, v28.4s, v5.4s
+ SQDMULH v29.4s, v29.4s, v5.4s
+ SQDMULH v30.4s, v30.4s, v5.4s
+ SQDMULH v31.4s, v31.4s, v5.4s
+ SRSHL v24.4s, v24.4s, v6.4s // signed rounding shift left
+ SRSHL v25.4s, v25.4s, v6.4s
+ SRSHL v26.4s, v26.4s, v6.4s
+ SRSHL v27.4s, v27.4s, v6.4s
+ SRSHL v28.4s, v28.4s, v6.4s
+ SRSHL v29.4s, v29.4s, v6.4s
+ SRSHL v30.4s, v30.4s, v6.4s
+ SRSHL v31.4s, v31.4s, v6.4s
+
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+ LD1R {v4.8b}, [x11], 1 // clamp min value
+
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v5.8b}, [x11] // clamp max value
+ SQXTN v0.8b, v24.8h
+ SQXTN v1.8b, v25.8h
+ SQXTN v2.8b, v26.8h
+ SQXTN v3.8b, v27.8h
+ SUB x11, x11, 15 // rewind params pointer
+
+ SMAX v0.8b, v0.8b, v4.8b
+ SMAX v1.8b, v1.8b, v4.8b
+ SMAX v2.8b, v2.8b, v4.8b
+ SMAX v3.8b, v3.8b, v4.8b
+ SUBS x1, x1, 8
+ SMIN v0.8b, v0.8b, v5.8b
+ SMIN v1.8b, v1.8b, v5.8b
+ SMIN v2.8b, v2.8b, v5.8b
+ SMIN v3.8b, v3.8b, v5.8b
+ B.LO 4f
+
+ # Store full 4 x 8
+ ST1 {v0.8b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v1.8b}, [x8], x12
+ SUB x15, x15, x2 // a1 -= kc
+ ST1 {v2.8b}, [x9], x12
+ SUB x13, x13, x2 // a2 -= kc
+ ST1 {v3.8b}, [x7], x12
+ SUB x4, x4, x2 // a3 -= kc
+ B.NE 0b
+ RET
+
+ # Remainder- 1 to 7 bytes of A
+ .p2align 3
+3:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x3], x0
+ LDR d5, [x5], 8
+ LD1 {v1.8b}, [x15], x0
+ LD1 {v2.8b}, [x13], x0
+ LD1 {v3.8b}, [x4], x0
+ SXTL v0.8h, v0.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 2b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 2b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 2b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 2b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 2b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 2b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 2b
+
+ # Store odd width
+ .p2align 3
+4:
+ TBZ x1, 2, 5f
+ STR s0, [x6], 4
+ STR s1, [x8], 4
+ DUP s0, v0.s[1]
+ DUP s1, v1.s[1]
+ STR s2, [x9], 4
+ STR s3, [x7], 4
+ DUP s2, v2.s[1]
+ DUP s3, v3.s[1]
+5:
+ TBZ x1, 1, 6f
+ STR h0, [x6], 2
+ STR h1, [x8], 2
+ DUP h0, v0.h[1]
+ DUP h1, v1.h[1]
+ STR h2, [x9], 2
+ STR h3, [x7], 2
+ DUP h2, v2.h[1]
+ DUP h3, v3.h[1]
+6:
+ TBZ x1, 0, 7f
+ STR b0, [x6]
+ STR b1, [x8]
+ STR b2, [x9]
+ STR b3, [x7]
+7:
+ RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in b/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
index cb7b592..d3e8d6e 100644
--- a/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
+++ b/src/qs8-igemm/4x16-aarch64-neon-mlal-lane-ld64.S.in
@@ -55,7 +55,7 @@
# A1 x14 v1
# A2 x15 v2
# A3 x20 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x16 v17 v21 v25 v29
# C2 x17 v18 v22 v26 v30
@@ -65,7 +65,6 @@
# unused v8 v9 v10 v11 v12 v13 v14 v15
$else:
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
BEGIN_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
@@ -81,7 +80,7 @@
CSEL x17, x16, x17, LS // c2 = c1
CMP x0, 4 // if mr < 4
- STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+ STR x20, [sp, -16]! // Save x20 on stack
ADD x7, x17, x7 // c3 = c2 + cm_stride
CSEL x7, x17, x7, LO // c3 = c2
@@ -571,8 +570,8 @@
# nc loop
B.HI 0b
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
# Remainder- 1 to 7 bytes of A
@@ -819,8 +818,8 @@
STR b1, [x16]
STR b0, [x6]
9:
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
END_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
diff --git a/src/qs8-igemm/4x8-aarch64-neon-mlal-lane-ld64.S.in b/src/qs8-igemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
new file mode 100644
index 0000000..f7f91b2
--- /dev/null
+++ b/src/qs8-igemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
@@ -0,0 +1,559 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert REQUANTIZATION in ["FP32", "RNDNU"]
+$assert not CHANNELWISE or REQUANTIZATION == "FP32"
+$assert DATATYPE in ["QC8", "QS8", "QU8"]
+$assert DATATYPE != "QC8" or REQUANTIZATION == "FP32"
+$assert DATATYPE != "QU8" or REQUANTIZATION == "RNDNU"
+
+#include <xnnpack/assembly.h>
+
+$PARAMS_UNION = "xnn_qs8_minmax_params" if CHANNELWISE else "xnn_qs8_conv_minmax_params"
+$if DATATYPE == "QU8":
+ $REWIND_DECREMENT = 19
+$else:
+ $REWIND_DECREMENT = 3 if CHANNELWISE else {"RNDNU": 15, "FP32": 7}[REQUANTIZATION]
+$XMIN = "UMIN" if DATATYPE == "QU8" else "SMIN"
+$XMAX = "UMAX" if DATATYPE == "QU8" else "SMAX"
+$XXTL = "UXTL" if DATATYPE == "QU8" else "SXTL"
+$SQXTXN = "SQXTUN" if DATATYPE == "QU8" else "SQXTN"
+$SQXTXN2 = "SQXTUN2" if DATATYPE == "QU8" else "SQXTN2"
+$XINT8_T = "uint8_t" if DATATYPE == "QU8" else "int8_t"
+# void xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const ${XINT8_T}** restrict a, x4
+# const ${XINT8_T}* restrict w, x5
+# ${XINT8_T}* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x10
+# size_t a_offset, [sp + 8] -> x8
+# const ${XINT8_T}* zero, [sp + 16] -> x12
+# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
+
+$if REQUANTIZATION == "RNDNU" and DATATYPE == "QU8":
+ # params structure is 20 bytes
+ # struct {
+ # ${XINT8_T} kernel_zero_point[4];
+ # int32_t right_pre_shift;
+ # int32_t multiplier;
+ # int32_t right_post_shift;
+ # int16_t output_zero_point;
+ # ${XINT8_T} output_min;
+ # ${XINT8_T} output_max;
+ # } rndnu_neon;
+ #
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0
+# A1 x14 v1
+# A2 x15 v2
+# A3 x20 v3
+# B x5 v5
+# C0 x6 v24 v28
+# C1 x16 v25 v29
+# C2 x17 v26 v30
+# C3 x7 v27 v31
+$if DATATYPE == "QU8":
+ # zero_point v7
+ # unused v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+$else:
+ # unused v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+
+BEGIN_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+ # Clamp C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x10, x8, [sp] // Load cn_stride, a_offset
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x16, x6, x16, LO // c1 = c0
+
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ LDP x12, x11, [sp, 16] // Load zero, params pointer
+ // if mr <= 2
+ CSEL x17, x16, x17, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ STR x20, [sp, -16]! // Save x20 on stack
+ ADD x7, x17, x7 // c3 = c2 + cm_stride
+ CSEL x7, x17, x7, LO // c3 = c2
+
+ $if DATATYPE == "QU8":
+ LD1R {v7.4s}, [x11] // kernel_zero_point
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q24, q28, [x5], 32
+ $if DATATYPE == "QU8":
+ ADD x11, x11, 4 // adjust params pointer
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ MOV x9, x3 // p = ks
+
+ .p2align 3
+1:
+ # Load next 4 A pointers
+ LDP x13, x14, [x4], 16
+ LDP x15, x20, [x4], 16
+
+ CMP x13, x12 // if a0 == zero
+ ADD x13, x13, x8 // a0 += a_offset
+ CSEL x13, x12, x13, EQ // a0 = zero, else += a0 + a_offset
+ CMP x14, x12 // if a1 == zero
+ ADD x14, x14, x8 // a1 += a_offset
+ CSEL x14, x12, x14, EQ // a1 = zero, else += a1 + a_offset
+ CMP x15, x12 // if a2 == zero
+ ADD x15, x15, x8 // a2 += a_offset
+ CSEL x15, x12, x15, EQ // a2 = zero, else += a2 + a_offset
+ CMP x20, x12 // if a3 == zero
+ ADD x20, x20, x8 // a3 += a_offset
+ CSEL x20, x12, x20, EQ // a3 = zero, else += a3 + a_offset
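+        # Equivalent to (illustrative C): an = (a[n] == zero) ? zero : a[n] + a_offset,
+        # so padding taps read from the shared zero buffer instead of real data.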
+
+ # Is there at least 8 bytes for main loop?
+ SUBS x0, x2, 8 // k = kc - 8
+ B.LO 4f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+2:
+ LD1 {v0.8b}, [x13], 8
+ LDR d5, [x5], 8
+ LD1 {v1.8b}, [x14], 8
+ LD1 {v2.8b}, [x15], 8
+ LD1 {v3.8b}, [x20], 8
+ ${XXTL} v0.8h, v0.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ ${XXTL} v1.8h, v1.8b
+ ${XXTL} v2.8h, v2.8b
+ ${XXTL} v3.8h, v3.8b
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x13, 128]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x14, 128]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x15, 128]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x20, 128]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 448]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ $if PREFETCH:
+ PRFM PLDL1KEEP, [x5, 512]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 2b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+ # Is there a remainder?- 1 to 7 bytes of A
+ CBNZ x0, 4f
+
+3:
+ # ks loop
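+        # Each pass of label 1 consumed one set of MR (4) A pointers; loop
+        # until the indirection entries for this output pixel are exhausted.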
+ SUBS x9, x9, 32 // ks -= MR * sizeof(${XINT8_T}*)
+ B.HI 1b
+
+ $if REQUANTIZATION == "RNDNU":
+ # Apply params - preshift, scale, postshift, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SQSHL v24.4s, v24.4s, v4.4s // shift to upper bits
+ SQSHL v25.4s, v25.4s, v4.4s
+ SQSHL v26.4s, v26.4s, v4.4s
+ SQSHL v27.4s, v27.4s, v4.4s
+ LD1R {v5.4s}, [x11], 4
+ SQSHL v28.4s, v28.4s, v4.4s
+ SQSHL v29.4s, v29.4s, v4.4s
+ SQSHL v30.4s, v30.4s, v4.4s
+ SQSHL v31.4s, v31.4s, v4.4s
+ LD1R {v6.4s}, [x11], 4
+ SQDMULH v24.4s, v24.4s, v5.4s // scale without rounding
+ SQDMULH v25.4s, v25.4s, v5.4s
+ SQDMULH v26.4s, v26.4s, v5.4s
+ SQDMULH v27.4s, v27.4s, v5.4s
+ SQDMULH v28.4s, v28.4s, v5.4s
+ SQDMULH v29.4s, v29.4s, v5.4s
+ SQDMULH v30.4s, v30.4s, v5.4s
+ SQDMULH v31.4s, v31.4s, v5.4s
+ SRSHL v24.4s, v24.4s, v6.4s // signed rounding shift left
+ SRSHL v25.4s, v25.4s, v6.4s
+ SRSHL v26.4s, v26.4s, v6.4s
+ SRSHL v27.4s, v27.4s, v6.4s
+ SRSHL v28.4s, v28.4s, v6.4s
+ SRSHL v29.4s, v29.4s, v6.4s
+ SRSHL v30.4s, v30.4s, v6.4s
+ SRSHL v31.4s, v31.4s, v6.4s
+ $elif REQUANTIZATION == "FP32":
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ $if not CHANNELWISE:
+ # Apply params - scale, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ $else:
+ # Load per channel scale values from weights
+ LDR q4, [x5], 16
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ LDR q5, [x5], 16
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ $if CHANNELWISE:
+ LDR q6, [x5], 16
+ FMUL v24.4s, v24.4s, v6.4s
+ FMUL v25.4s, v25.4s, v6.4s
+ FMUL v26.4s, v26.4s, v6.4s
+ FMUL v27.4s, v27.4s, v6.4s
+ LDR q4, [x5], 16
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+ $else:
+ FMUL v24.4s, v24.4s, v4.4s
+ FMUL v25.4s, v25.4s, v4.4s
+ FMUL v26.4s, v26.4s, v4.4s
+ FMUL v27.4s, v27.4s, v4.4s
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+ LD1R {v4.8b}, [x11], 1 // clamp min value
+
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v5.8b}, [x11] // clamp max value
+ ${SQXTXN} v0.8b, v24.8h
+ ${SQXTXN} v1.8b, v25.8h
+ ${SQXTXN} v2.8b, v26.8h
+ ${SQXTXN} v3.8b, v27.8h
+ SUB x11, x11, ${REWIND_DECREMENT} // rewind params pointer
+
+ ${XMAX} v0.8b, v0.8b, v4.8b
+ ${XMAX} v1.8b, v1.8b, v4.8b
+ ${XMAX} v2.8b, v2.8b, v4.8b
+ ${XMAX} v3.8b, v3.8b, v4.8b
+ SUBS x1, x1, 8
+ ${XMIN} v0.8b, v0.8b, v5.8b
+ ${XMIN} v1.8b, v1.8b, v5.8b
+ ${XMIN} v2.8b, v2.8b, v5.8b
+ ${XMIN} v3.8b, v3.8b, v5.8b
+ B.LO 5f
+
+ # Store full 4 x 8
+ ST1 {v3.8b}, [x7], x10
+ ST1 {v2.8b}, [x17], x10
+ ST1 {v1.8b}, [x16], x10
+ ST1 {v0.8b}, [x6], x10
+
+ SUB x4, x4, x3 // a -= ks
+
+ # nc loop
+ B.HI 0b
+
+ # Restore x20 from stack
+ LDR x20, [sp], 16
+ RET
+
+ # Remainder- 1 to 7 bytes of A
+ .p2align 3
+4:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x13], x0
+ LDR d5, [x5], 8
+ LD1 {v1.8b}, [x14], x0
+ LD1 {v2.8b}, [x15], x0
+ LD1 {v3.8b}, [x20], x0
+ ${XXTL} v0.8h, v0.8b
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ ${XXTL} v1.8h, v1.8b
+ ${XXTL} v2.8h, v2.8b
+ ${XXTL} v3.8h, v3.8b
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 3b
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 3b
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 3b
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 3b
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 3b
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 3b
+
+ LDR d5, [x5], 8
+ $if DATATYPE == "QU8":
+ USUBL v5.8h, v5.8b, v7.8b
+ $else:
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 3b
+
+ # Store odd width
+ .p2align 3
+5:
+ TBZ x1, 2, 6f
+ STR s3, [x7], 4
+ STR s2, [x17], 4
+ DUP s3, v3.s[1]
+ DUP s2, v2.s[1]
+ STR s1, [x16], 4
+ STR s0, [x6], 4
+ DUP s1, v1.s[1]
+ DUP s0, v0.s[1]
+6:
+ TBZ x1, 1, 7f
+ STR h3, [x7], 2
+ STR h2, [x17], 2
+ DUP h3, v3.h[1]
+ DUP h2, v2.h[1]
+ STR h1, [x16], 2
+ STR h0, [x6], 2
+ DUP h1, v1.h[1]
+ DUP h0, v0.h[1]
+7:
+ TBZ x1, 0, 8f
+ STR b3, [x7]
+ STR b2, [x17]
+ STR b1, [x16]
+ STR b0, [x6]
+8:
+ # Restore x20 from stack
+ LDR x20, [sp], 16
+ RET
+
+END_FUNCTION xnn_${DATATYPE.lower()}_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x8__aarch64_neon_mlal_lane${"_prfm" if PREFETCH else ""}_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
index 42794c2..dfcdb5e 100644
--- a/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-ld64.S
@@ -31,13 +31,12 @@
# A1 x14 v1
# A2 x15 v2
# A3 x20 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x16 v17 v21 v25 v29
# C2 x17 v18 v22 v26 v30
# C3 x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
@@ -53,7 +52,7 @@
CSEL x17, x16, x17, LS // c2 = c1
CMP x0, 4 // if mr < 4
- STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+ STR x20, [sp, -16]! // Save x20 on stack
ADD x7, x17, x7 // c3 = c2 + cm_stride
CSEL x7, x17, x7, LO // c3 = c2
@@ -398,8 +397,8 @@
# nc loop
B.HI 0b
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
# Remainder- 1 to 7 bytes of A
@@ -604,8 +603,8 @@
STR b1, [x16]
STR b0, [x6]
9:
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_ld64
diff --git a/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
index b9cda9e..6c9bfb9 100644
--- a/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qs8-igemm/gen/4x16-minmax-fp32-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -31,13 +31,12 @@
# A1 x14 v1
# A2 x15 v2
# A3 x20 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x16 v17 v21 v25 v29
# C2 x17 v18 v22 v26 v30
# C3 x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
@@ -53,7 +52,7 @@
CSEL x17, x16, x17, LS // c2 = c1
CMP x0, 4 // if mr < 4
- STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+ STR x20, [sp, -16]! // Save x20 on stack
ADD x7, x17, x7 // c3 = c2 + cm_stride
CSEL x7, x17, x7, LO // c3 = c2
@@ -404,8 +403,8 @@
# nc loop
B.HI 0b
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
# Remainder- 1 to 7 bytes of A
@@ -610,8 +609,8 @@
STR b1, [x16]
STR b0, [x6]
9:
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
diff --git a/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
index 2502537..eeee346 100644
--- a/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -31,13 +31,12 @@
# A1 x14 v1
# A2 x15 v2
# A3 x20 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x16 v17 v21 v25 v29
# C2 x17 v18 v22 v26 v30
# C3 x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
@@ -53,7 +52,7 @@
CSEL x17, x16, x17, LS // c2 = c1
CMP x0, 4 // if mr < 4
- STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+ STR x20, [sp, -16]! // Save x20 on stack
ADD x7, x17, x7 // c3 = c2 + cm_stride
CSEL x7, x17, x7, LO // c3 = c2
@@ -398,8 +397,8 @@
# nc loop
B.HI 0b
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
# Remainder- 1 to 7 bytes of A
@@ -604,8 +603,8 @@
STR b1, [x16]
STR b0, [x6]
9:
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
diff --git a/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
index 70f0419..fc73819 100644
--- a/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -31,13 +31,12 @@
# A1 x14 v1
# A2 x15 v2
# A3 x20 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x16 v17 v21 v25 v29
# C2 x17 v18 v22 v26 v30
# C3 x7 v19 v23 v27 v31
# unused v7 v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
@@ -53,7 +52,7 @@
CSEL x17, x16, x17, LS // c2 = c1
CMP x0, 4 // if mr < 4
- STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+ STR x20, [sp, -16]! // Save x20 on stack
ADD x7, x17, x7 // c3 = c2 + cm_stride
CSEL x7, x17, x7, LO // c3 = c2
@@ -404,8 +403,8 @@
# nc loop
B.HI 0b
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
# Remainder- 1 to 7 bytes of A
@@ -610,8 +609,8 @@
STR b1, [x16]
STR b0, [x6]
9:
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
diff --git a/src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
new file mode 100644
index 0000000..ed98da2
--- /dev/null
+++ b/src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -0,0 +1,420 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const int8_t** restrict a, x4
+# const int8_t* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x10
+# size_t a_offset, [sp + 8] -> x8
+# const int8_t* zero, [sp + 16] -> x12
+# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0
+# A1 x14 v1
+# A2 x15 v2
+# A3 x20 v3
+# B x5 v5
+# C0 x6 v24 v28
+# C1 x16 v25 v29
+# C2 x17 v26 v30
+# C3 x7 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64
+
+ # Clamp C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x10, x8, [sp] // Load cn_stride, a_offset
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x16, x6, x16, LO // c1 = c0
+
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ LDP x12, x11, [sp, 16] // Load zero, params pointer
+ // if mr <= 2
+ CSEL x17, x16, x17, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ STR x20, [sp, -16]! // Save x20 on stack
+ ADD x7, x17, x7 // c3 = c2 + cm_stride
+ CSEL x7, x17, x7, LO // c3 = c2
+
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q24, q28, [x5], 32
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ MOV x9, x3 // p = ks
+
+ .p2align 3
+1:
+ # Load next 4 A pointers
+ LDP x13, x14, [x4], 16
+ LDP x15, x20, [x4], 16
+
+ CMP x13, x12 // if a0 == zero
+ ADD x13, x13, x8 // a0 += a_offset
+ CSEL x13, x12, x13, EQ // a0 = zero, else += a0 + a_offset
+ CMP x14, x12 // if a1 == zero
+ ADD x14, x14, x8 // a1 += a_offset
+ CSEL x14, x12, x14, EQ // a1 = zero, else += a1 + a_offset
+ CMP x15, x12 // if a2 == zero
+ ADD x15, x15, x8 // a2 += a_offset
+ CSEL x15, x12, x15, EQ // a2 = zero, else += a2 + a_offset
+ CMP x20, x12 // if a3 == zero
+ ADD x20, x20, x8 // a3 += a_offset
+ CSEL x20, x12, x20, EQ // a3 = zero, else += a3 + a_offset
+
+ # Is there at least 8 bytes for main loop?
+ SUBS x0, x2, 8 // k = kc - 8
+ B.LO 4f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+2:
+ LD1 {v0.8b}, [x13], 8
+ LDR d5, [x5], 8
+ LD1 {v1.8b}, [x14], 8
+ LD1 {v2.8b}, [x15], 8
+ LD1 {v3.8b}, [x20], 8
+ SXTL v0.8h, v0.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 2b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+ # Is there a remainder?- 1 to 7 bytes of A
+ CBNZ x0, 4f
+
+3:
+ # ks loop
+ SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
+ B.HI 1b
+
+ # Apply params - preshift, scale, postshift, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SQSHL v24.4s, v24.4s, v4.4s // shift to upper bits
+ SQSHL v25.4s, v25.4s, v4.4s
+ SQSHL v26.4s, v26.4s, v4.4s
+ SQSHL v27.4s, v27.4s, v4.4s
+ LD1R {v5.4s}, [x11], 4
+ SQSHL v28.4s, v28.4s, v4.4s
+ SQSHL v29.4s, v29.4s, v4.4s
+ SQSHL v30.4s, v30.4s, v4.4s
+ SQSHL v31.4s, v31.4s, v4.4s
+ LD1R {v6.4s}, [x11], 4
+ SQDMULH v24.4s, v24.4s, v5.4s // scale without rounding
+ SQDMULH v25.4s, v25.4s, v5.4s
+ SQDMULH v26.4s, v26.4s, v5.4s
+ SQDMULH v27.4s, v27.4s, v5.4s
+ SQDMULH v28.4s, v28.4s, v5.4s
+ SQDMULH v29.4s, v29.4s, v5.4s
+ SQDMULH v30.4s, v30.4s, v5.4s
+ SQDMULH v31.4s, v31.4s, v5.4s
+ SRSHL v24.4s, v24.4s, v6.4s // signed rounding shift left
+ SRSHL v25.4s, v25.4s, v6.4s
+ SRSHL v26.4s, v26.4s, v6.4s
+ SRSHL v27.4s, v27.4s, v6.4s
+ SRSHL v28.4s, v28.4s, v6.4s
+ SRSHL v29.4s, v29.4s, v6.4s
+ SRSHL v30.4s, v30.4s, v6.4s
+ SRSHL v31.4s, v31.4s, v6.4s
+
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+ LD1R {v4.8b}, [x11], 1 // clamp min value
+
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v5.8b}, [x11] // clamp max value
+ SQXTN v0.8b, v24.8h
+ SQXTN v1.8b, v25.8h
+ SQXTN v2.8b, v26.8h
+ SQXTN v3.8b, v27.8h
+ SUB x11, x11, 15 // rewind params pointer
+
+ SMAX v0.8b, v0.8b, v4.8b
+ SMAX v1.8b, v1.8b, v4.8b
+ SMAX v2.8b, v2.8b, v4.8b
+ SMAX v3.8b, v3.8b, v4.8b
+ SUBS x1, x1, 8
+ SMIN v0.8b, v0.8b, v5.8b
+ SMIN v1.8b, v1.8b, v5.8b
+ SMIN v2.8b, v2.8b, v5.8b
+ SMIN v3.8b, v3.8b, v5.8b
+ B.LO 5f
+
+ # Store full 4 x 8
+ ST1 {v3.8b}, [x7], x10
+ ST1 {v2.8b}, [x17], x10
+ ST1 {v1.8b}, [x16], x10
+ ST1 {v0.8b}, [x6], x10
+
+ SUB x4, x4, x3 // a -= ks
+
+ # nc loop
+ B.HI 0b
+
+ # Restore x20 from stack
+ LDR x20, [sp], 16
+ RET
+
+ # Remainder- 1 to 7 bytes of A
+ .p2align 3
+4:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x13], x0
+ LDR d5, [x5], 8
+ LD1 {v1.8b}, [x14], x0
+ LD1 {v2.8b}, [x15], x0
+ LD1 {v3.8b}, [x20], x0
+ SXTL v0.8h, v0.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 3b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 3b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 3b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 3b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 3b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 3b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 3b
+
+ # Store odd width
+ .p2align 3
+5:
+ TBZ x1, 2, 6f
+ STR s3, [x7], 4
+ STR s2, [x17], 4
+ DUP s3, v3.s[1]
+ DUP s2, v2.s[1]
+ STR s1, [x16], 4
+ STR s0, [x6], 4
+ DUP s1, v1.s[1]
+ DUP s0, v0.s[1]
+6:
+ TBZ x1, 1, 7f
+ STR h3, [x7], 2
+ STR h2, [x17], 2
+ DUP h3, v3.h[1]
+ DUP h2, v2.h[1]
+ STR h1, [x16], 2
+ STR h0, [x6], 2
+ DUP h1, v1.h[1]
+ DUP h0, v0.h[1]
+7:
+ TBZ x1, 0, 8f
+ STR b3, [x7]
+ STR b2, [x17]
+ STR b1, [x16]
+ STR b0, [x6]
+8:
+ # Restore x20 from stack
+ LDR x20, [sp], 16
+ RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
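
The requantization block above (SQSHL, SQDMULH, SRSHL, zero-point add, clamp) is the rndnu scheme the kernel name advertises. A scalar sketch of one accumulator follows; the parameter names are assumptions and the edge-case saturation of SQDMULH is ignored, so this is a reading aid, not the library's xnn_qs8_requantize_rndnu reference.

#include <stdint.h>

// Sketch only: post_right_shift is the negated SRSHL operand.
static inline int8_t requantize_rndnu_sketch(
    int32_t acc, uint32_t pre_shift, int32_t multiplier,
    uint32_t post_right_shift, int16_t output_zero_point,
    int8_t output_min, int8_t output_max)
{
  // SQSHL: saturating shift of the accumulator into the upper bits.
  int64_t shifted = (int64_t) acc << pre_shift;
  if (shifted > INT32_MAX) shifted = INT32_MAX;
  if (shifted < INT32_MIN) shifted = INT32_MIN;

  // SQDMULH: doubling multiply-high, i.e. keep bits 62:31 (no rounding).
  int32_t scaled = (int32_t) ((shifted * (int64_t) multiplier) >> 31);

  // SRSHL by a negative amount: rounding arithmetic shift right.
  int32_t rounded = post_right_shift == 0 ? scaled
      : (int32_t) (((int64_t) scaled + (INT64_C(1) << (post_right_shift - 1)))
                   >> post_right_shift);

  // SQXTN + SQADD: narrow to 16 bits and add the output zero point ("bias").
  int32_t out = rounded + (int32_t) output_zero_point;

  // SQXTN + SMAX/SMIN: narrow to 8 bits and clamp.
  if (out < (int32_t) output_min) out = (int32_t) output_min;
  if (out > (int32_t) output_max) out = (int32_t) output_max;
  return (int8_t) out;
}

The LD1R loads advance x11 by 4 + 4 + 4 + 2 + 1 = 15 bytes (the final clamp-max load does not post-increment), which is exactly what the SUB x11, x11, 15 rewind undoes before the next column block reuses the params.
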
diff --git a/src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
new file mode 100644
index 0000000..3f0fc8c
--- /dev/null
+++ b/src/qs8-igemm/gen/4x8-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -0,0 +1,426 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/4x8-aarch64-neon-mlal-lane-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const int8_t** restrict a, x4
+# const int8_t* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x10
+# size_t a_offset, [sp + 8] -> x8
+# const int8_t* zero, [sp + 16] -> x12
+# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0
+# A1 x14 v1
+# A2 x15 v2
+# A3 x20 v3
+# B x5 v5
+# C0 x6 v24 v28
+# C1 x16 v25 v29
+# C2 x17 v26 v30
+# C3 x7 v27 v31
+# unused v7 v8 v9 v10 v11 v12 v13 v14 v15 v16 v17 v18 v19 v20 v21 v22 v23
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64
+
+ # Clamp C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x10, x8, [sp] // Load cn_stride, a_offset
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x16, x6, x16, LO // c1 = c0
+
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ LDP x12, x11, [sp, 16] // Load zero, params pointer
+ // if mr <= 2
+ CSEL x17, x16, x17, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ STR x20, [sp, -16]! // Save x20 on stack
+ ADD x7, x17, x7 // c3 = c2 + cm_stride
+ CSEL x7, x17, x7, LO // c3 = c2
+
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q24, q28, [x5], 32
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ MOV x9, x3 // p = ks
+
+ .p2align 3
+1:
+ # Load next 4 A pointers
+ LDP x13, x14, [x4], 16
+ LDP x15, x20, [x4], 16
+
+ CMP x13, x12 // if a0 == zero
+ ADD x13, x13, x8 // a0 += a_offset
+ CSEL x13, x12, x13, EQ // a0 = zero, else a0 += a_offset
+ CMP x14, x12 // if a1 == zero
+ ADD x14, x14, x8 // a1 += a_offset
+ CSEL x14, x12, x14, EQ // a1 = zero, else a1 += a_offset
+ CMP x15, x12 // if a2 == zero
+ ADD x15, x15, x8 // a2 += a_offset
+ CSEL x15, x12, x15, EQ // a2 = zero, else a2 += a_offset
+ CMP x20, x12 // if a3 == zero
+ ADD x20, x20, x8 // a3 += a_offset
+ CSEL x20, x12, x20, EQ // a3 = zero, else a3 += a_offset
+
+ # Is there at least 8 bytes for main loop?
+ SUBS x0, x2, 8 // k = kc - 8
+ B.LO 4f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+2:
+ LD1 {v0.8b}, [x13], 8
+ LDR d5, [x5], 8
+ LD1 {v1.8b}, [x14], 8
+ LD1 {v2.8b}, [x15], 8
+ LD1 {v3.8b}, [x20], 8
+ SXTL v0.8h, v0.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ PRFM PLDL1KEEP, [x13, 128]
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ PRFM PLDL1KEEP, [x14, 128]
+ PRFM PLDL1KEEP, [x15, 128]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ PRFM PLDL1KEEP, [x20, 128]
+ PRFM PLDL1KEEP, [x5, 448]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ PRFM PLDL1KEEP, [x5, 512]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[7]
+ SMLAL2 v28.4s, v5.8h, v0.h[7]
+ SMLAL v25.4s, v5.4h, v1.h[7]
+ SMLAL2 v29.4s, v5.8h, v1.h[7]
+ SMLAL v26.4s, v5.4h, v2.h[7]
+ SMLAL2 v30.4s, v5.8h, v2.h[7]
+ SMLAL v27.4s, v5.4h, v3.h[7]
+ SMLAL2 v31.4s, v5.8h, v3.h[7]
+
+ SUBS x0, x0, 8
+ B.HS 2b
+
+ AND x0, x2, 7 // kc remainder 0 to 7
+ # Is there a remainder?- 1 to 7 bytes of A
+ CBNZ x0, 4f
+
+3:
+ # ks loop
+ SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
+ B.HI 1b
+
+ # Apply params - preshift, scale, postshift, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SQSHL v24.4s, v24.4s, v4.4s // shift to upper bits
+ SQSHL v25.4s, v25.4s, v4.4s
+ SQSHL v26.4s, v26.4s, v4.4s
+ SQSHL v27.4s, v27.4s, v4.4s
+ LD1R {v5.4s}, [x11], 4
+ SQSHL v28.4s, v28.4s, v4.4s
+ SQSHL v29.4s, v29.4s, v4.4s
+ SQSHL v30.4s, v30.4s, v4.4s
+ SQSHL v31.4s, v31.4s, v4.4s
+ LD1R {v6.4s}, [x11], 4
+ SQDMULH v24.4s, v24.4s, v5.4s // scale without rounding
+ SQDMULH v25.4s, v25.4s, v5.4s
+ SQDMULH v26.4s, v26.4s, v5.4s
+ SQDMULH v27.4s, v27.4s, v5.4s
+ SQDMULH v28.4s, v28.4s, v5.4s
+ SQDMULH v29.4s, v29.4s, v5.4s
+ SQDMULH v30.4s, v30.4s, v5.4s
+ SQDMULH v31.4s, v31.4s, v5.4s
+ SRSHL v24.4s, v24.4s, v6.4s // signed rounding shift left
+ SRSHL v25.4s, v25.4s, v6.4s
+ SRSHL v26.4s, v26.4s, v6.4s
+ SRSHL v27.4s, v27.4s, v6.4s
+ SRSHL v28.4s, v28.4s, v6.4s
+ SRSHL v29.4s, v29.4s, v6.4s
+ SRSHL v30.4s, v30.4s, v6.4s
+ SRSHL v31.4s, v31.4s, v6.4s
+
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // add bias
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+ LD1R {v4.8b}, [x11], 1 // clamp min value
+
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v5.8b}, [x11] // clamp max value
+ SQXTN v0.8b, v24.8h
+ SQXTN v1.8b, v25.8h
+ SQXTN v2.8b, v26.8h
+ SQXTN v3.8b, v27.8h
+ SUB x11, x11, 15 // rewind params pointer
+
+ SMAX v0.8b, v0.8b, v4.8b
+ SMAX v1.8b, v1.8b, v4.8b
+ SMAX v2.8b, v2.8b, v4.8b
+ SMAX v3.8b, v3.8b, v4.8b
+ SUBS x1, x1, 8
+ SMIN v0.8b, v0.8b, v5.8b
+ SMIN v1.8b, v1.8b, v5.8b
+ SMIN v2.8b, v2.8b, v5.8b
+ SMIN v3.8b, v3.8b, v5.8b
+ B.LO 5f
+
+ # Store full 4 x 8
+ ST1 {v3.8b}, [x7], x10
+ ST1 {v2.8b}, [x17], x10
+ ST1 {v1.8b}, [x16], x10
+ ST1 {v0.8b}, [x6], x10
+
+ SUB x4, x4, x3 // a -= ks
+
+ # nc loop
+ B.HI 0b
+
+ # Restore x20 from stack
+ LDR x20, [sp], 16
+ RET
+
+ # Remainder- 1 to 7 bytes of A
+ .p2align 3
+4:
+ AND x0, x2, 7 // kc remainder 1 to 7
+
+ LD1 {v0.8b}, [x13], x0
+ LDR d5, [x5], 8
+ LD1 {v1.8b}, [x14], x0
+ LD1 {v2.8b}, [x15], x0
+ LD1 {v3.8b}, [x20], x0
+ SXTL v0.8h, v0.8b
+ SXTL v5.8h, v5.8b
+ SXTL v1.8h, v1.8b
+ SXTL v2.8h, v2.8b
+ SXTL v3.8h, v3.8b
+ SMLAL v24.4s, v5.4h, v0.h[0]
+ SMLAL2 v28.4s, v5.8h, v0.h[0]
+ SMLAL v25.4s, v5.4h, v1.h[0]
+ SMLAL2 v29.4s, v5.8h, v1.h[0]
+ SMLAL v26.4s, v5.4h, v2.h[0]
+ SMLAL2 v30.4s, v5.8h, v2.h[0]
+ SMLAL v27.4s, v5.4h, v3.h[0]
+ SMLAL2 v31.4s, v5.8h, v3.h[0]
+ CMP x0, 2
+ B.LO 3b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[1]
+ SMLAL2 v28.4s, v5.8h, v0.h[1]
+ SMLAL v25.4s, v5.4h, v1.h[1]
+ SMLAL2 v29.4s, v5.8h, v1.h[1]
+ SMLAL v26.4s, v5.4h, v2.h[1]
+ SMLAL2 v30.4s, v5.8h, v2.h[1]
+ SMLAL v27.4s, v5.4h, v3.h[1]
+ SMLAL2 v31.4s, v5.8h, v3.h[1]
+ B.EQ 3b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[2]
+ SMLAL2 v28.4s, v5.8h, v0.h[2]
+ SMLAL v25.4s, v5.4h, v1.h[2]
+ SMLAL2 v29.4s, v5.8h, v1.h[2]
+ SMLAL v26.4s, v5.4h, v2.h[2]
+ SMLAL2 v30.4s, v5.8h, v2.h[2]
+ SMLAL v27.4s, v5.4h, v3.h[2]
+ SMLAL2 v31.4s, v5.8h, v3.h[2]
+ CMP x0, 4
+ B.LO 3b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[3]
+ SMLAL2 v28.4s, v5.8h, v0.h[3]
+ SMLAL v25.4s, v5.4h, v1.h[3]
+ SMLAL2 v29.4s, v5.8h, v1.h[3]
+ SMLAL v26.4s, v5.4h, v2.h[3]
+ SMLAL2 v30.4s, v5.8h, v2.h[3]
+ SMLAL v27.4s, v5.4h, v3.h[3]
+ SMLAL2 v31.4s, v5.8h, v3.h[3]
+ B.EQ 3b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[4]
+ SMLAL2 v28.4s, v5.8h, v0.h[4]
+ SMLAL v25.4s, v5.4h, v1.h[4]
+ SMLAL2 v29.4s, v5.8h, v1.h[4]
+ SMLAL v26.4s, v5.4h, v2.h[4]
+ SMLAL2 v30.4s, v5.8h, v2.h[4]
+ SMLAL v27.4s, v5.4h, v3.h[4]
+ SMLAL2 v31.4s, v5.8h, v3.h[4]
+ CMP x0, 6
+ B.LO 3b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[5]
+ SMLAL2 v28.4s, v5.8h, v0.h[5]
+ SMLAL v25.4s, v5.4h, v1.h[5]
+ SMLAL2 v29.4s, v5.8h, v1.h[5]
+ SMLAL v26.4s, v5.4h, v2.h[5]
+ SMLAL2 v30.4s, v5.8h, v2.h[5]
+ SMLAL v27.4s, v5.4h, v3.h[5]
+ SMLAL2 v31.4s, v5.8h, v3.h[5]
+ B.EQ 3b
+
+ LDR d5, [x5], 8
+ SXTL v5.8h, v5.8b
+ SMLAL v24.4s, v5.4h, v0.h[6]
+ SMLAL2 v28.4s, v5.8h, v0.h[6]
+ SMLAL v25.4s, v5.4h, v1.h[6]
+ SMLAL2 v29.4s, v5.8h, v1.h[6]
+ SMLAL v26.4s, v5.4h, v2.h[6]
+ SMLAL2 v30.4s, v5.8h, v2.h[6]
+ SMLAL v27.4s, v5.4h, v3.h[6]
+ SMLAL2 v31.4s, v5.8h, v3.h[6]
+ B 3b
+
+ # Store odd width
+ .p2align 3
+5:
+ TBZ x1, 2, 6f
+ STR s3, [x7], 4
+ STR s2, [x17], 4
+ DUP s3, v3.s[1]
+ DUP s2, v2.s[1]
+ STR s1, [x16], 4
+ STR s0, [x6], 4
+ DUP s1, v1.s[1]
+ DUP s0, v0.s[1]
+6:
+ TBZ x1, 1, 7f
+ STR h3, [x7], 2
+ STR h2, [x17], 2
+ DUP h3, v3.h[1]
+ DUP h2, v2.h[1]
+ STR h1, [x16], 2
+ STR h0, [x6], 2
+ DUP h1, v1.h[1]
+ DUP h0, v0.h[1]
+7:
+ TBZ x1, 0, 8f
+ STR b3, [x7]
+ STR b2, [x17]
+ STR b1, [x16]
+ STR b0, [x6]
+8:
+ # Restore x20 from stack
+ LDR x20, [sp], 16
+ RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
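
Both new 4x8 kernels are built from the same mlal-lane idiom: each 8-byte row of packed weights is widened to int16 and multiplied by a single int16 lane of the activations, accumulating into a low and a high int32x4 half (the SMLAL/SMLAL2 pairs). The prfm variant differs only in the PRFM PLDL1KEEP prefetches it issues on the A pointers and the weights. An intrinsics-level sketch of one lane of one row, with assumed variable names (the generated assembly above, not this sketch, is the shipped code):

#include <arm_neon.h>

// Row 0, lane 0 only; the kernels unroll this across the 8 activation lanes
// and 4 rows, reusing the widened weights vector for all rows.
static inline void mlal_lane_row0_lane0(
    const int8_t* a0, const int8_t* w,
    int32x4_t* acc0_lo /* v24 */, int32x4_t* acc0_hi /* v28 */)
{
  int16x8_t va0 = vmovl_s8(vld1_s8(a0));  // LD1 {v0.8b} + SXTL v0.8h, v0.8b
  int16x8_t vb  = vmovl_s8(vld1_s8(w));   // LDR d5      + SXTL v5.8h, v5.8b

  // SMLAL  v24.4s, v5.4h, v0.h[0]
  *acc0_lo = vmlal_laneq_s16(*acc0_lo, vget_low_s16(vb), va0, 0);
  // SMLAL2 v28.4s, v5.8h, v0.h[0]
  *acc0_hi = vmlal_high_laneq_s16(*acc0_hi, vb, va0, 0);

  // Lanes 1..7 repeat this pair against the next 8-byte rows of w
  // (w + 8, w + 16, ...); rows 1..3 do the same with va1..va3.
}
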
diff --git a/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
index 59ecaa4..9f53c93 100644
--- a/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -40,7 +40,7 @@
# A1 x15 v1
# A2 x13 v2
# A3 x4 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x8 v17 v21 v25 v29
# C2 x9 v18 v22 v26 v30
@@ -48,8 +48,6 @@
# zero_point v7
# unused v8 v9 v10 v11 v12 v13 v14 v15
-# x10 x17 a53 temp registers
-
BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
# Clamp A and C pointers
diff --git a/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
index a4bc0d1..642f8fb 100644
--- a/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -40,7 +40,7 @@
# A1 x15 v1
# A2 x13 v2
# A3 x4 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x8 v17 v21 v25 v29
# C2 x9 v18 v22 v26 v30
@@ -48,8 +48,6 @@
# zero_point v7
# unused v8 v9 v10 v11 v12 v13 v14 v15
-# x10 x17 a53 temp registers
-
BEGIN_FUNCTION xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
# Clamp A and C pointers
diff --git a/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S b/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
index a7c3e77..1f88d52 100644
--- a/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
+++ b/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-ld64.S
@@ -42,14 +42,13 @@
# A1 x14 v1
# A2 x15 v2
# A3 x20 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x16 v17 v21 v25 v29
# C2 x17 v18 v22 v26 v30
# C3 x7 v19 v23 v27 v31
# zero_point v7
# unused v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
@@ -65,7 +64,7 @@
CSEL x17, x16, x17, LS // c2 = c1
CMP x0, 4 // if mr < 4
- STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+ STR x20, [sp, -16]! // Save x20 on stack
ADD x7, x17, x7 // c3 = c2 + cm_stride
CSEL x7, x17, x7, LO // c3 = c2
@@ -412,8 +411,8 @@
# nc loop
B.HI 0b
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
# Remainder- 1 to 7 bytes of A
@@ -618,8 +617,8 @@
STR b1, [x16]
STR b0, [x6]
9:
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64
diff --git a/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S b/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
index ec7eff2..dc1f6fa 100644
--- a/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
+++ b/src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
@@ -42,14 +42,13 @@
# A1 x14 v1
# A2 x15 v2
# A3 x20 v3
-# B x5 v4 v5 v6
+# B x5 v4 v5
# C0 x6 v16 v20 v24 v28
# C1 x16 v17 v21 v25 v29
# C2 x17 v18 v22 v26 v30
# C3 x7 v19 v23 v27 v31
# zero_point v7
# unused v8 v9 v10 v11 v12 v13 v14 v15
-# x11, x21 temp for Cortex-A53 loads
BEGIN_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
@@ -65,7 +64,7 @@
CSEL x17, x16, x17, LS // c2 = c1
CMP x0, 4 // if mr < 4
- STP x20, x21, [sp, -16]! // Save x20-x21 on stack
+ STR x20, [sp, -16]! // Save x20 on stack
ADD x7, x17, x7 // c3 = c2 + cm_stride
CSEL x7, x17, x7, LO // c3 = c2
@@ -418,8 +417,8 @@
# nc loop
B.HI 0b
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
# Remainder- 1 to 7 bytes of A
@@ -624,8 +623,8 @@
STR b1, [x16]
STR b0, [x6]
9:
- # Restore x20-x21 from stack
- LDP x20, x21, [sp], 16
+ # Restore x20 from stack
+ LDR x20, [sp], 16
RET
END_FUNCTION xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64
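
The odd-width store path shared by these igemm kernels (the 5:, 6: and 7: labels in the listings above) walks the low bits of the remaining column count, writing 4, then 2, then 1 byte per row and shifting the result vector down with DUP after each partial store. A scalar sketch of one row, illustrative only:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// c is the row's output pointer, row holds the 8 requantized bytes,
// n is the remaining column count (1..7).
static void store_odd_width_row(int8_t* c, const int8_t row[8], size_t n) {
  size_t i = 0;
  if (n & 4) { memcpy(c, row + i, 4); c += 4; i += 4; }  // TBZ x1, 2 ... STR s
  if (n & 2) { memcpy(c, row + i, 2); c += 2; i += 2; }  // TBZ x1, 1 ... STR h
  if (n & 1) { *c = row[i]; }                            // TBZ x1, 0 ... STR b
}
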
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index e7b77ab..1da47f6 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -1006,6 +1006,9 @@
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64)
+
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index c6067b4..b338f8c 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -829,6 +829,9 @@
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64)
+
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64)
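
The new declarations expand to the usual qs8 IGEMM microkernel prototype. Paraphrasing the argument-list comment at the top of the assembly files above (exact parameter types come from the xnnpack headers, so treat this as a reading aid rather than the authoritative signature):

#include <stddef.h>
#include <stdint.h>

union xnn_qs8_conv_minmax_params;  // defined in the xnnpack headers

void xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64(
    size_t mr,                                          // x0
    size_t nc,                                          // x1
    size_t kc,                                          // x2 / x0
    size_t ks,                                          // x3 / x9
    const int8_t** restrict a,                          // x4
    const int8_t* restrict w,                           // x5
    int8_t* restrict c,                                 // x6
    size_t cm_stride,                                   // x7
    size_t cn_stride,                                   // [sp] -> x10
    size_t a_offset,                                    // [sp + 8] -> x8
    const int8_t* zero,                                 // [sp + 16] -> x12
    const union xnn_qs8_conv_minmax_params* params);    // [sp + 24] -> x11
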
diff --git a/test/qs8-gemm-minmax-rndnu.cc b/test/qs8-gemm-minmax-rndnu.cc
index 06aebe6..906d414 100644
--- a/test/qs8-gemm-minmax-rndnu.cc
+++ b/test/qs8-gemm-minmax-rndnu.cc
@@ -22,6 +22,918 @@
#include "gemm-microkernel-tester.h"
+#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_GEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+#endif // XNN_ARCH_ARM64
+
+
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(QS8_GEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_eq_8) {
TEST_REQUIRES_ARM_NEON;
diff --git a/test/qs8-gemm-minmax-rndnu.yaml b/test/qs8-gemm-minmax-rndnu.yaml
index a13d876..9d3ffb6 100644
--- a/test/qs8-gemm-minmax-rndnu.yaml
+++ b/test/qs8-gemm-minmax-rndnu.yaml
@@ -3,6 +3,12 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64
+ init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+ k-block: 8
+- name: xnn_qs8_gemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64
+ init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+ k-block: 8
- name: xnn_qs8_gemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r
init: xnn_init_qs8_conv_minmax_rndnu_neon_params
k-block: 8
diff --git a/test/qs8-igemm-minmax-rndnu.cc b/test/qs8-igemm-minmax-rndnu.cc
index a6fef58..fb77800 100644
--- a/test/qs8-igemm-minmax-rndnu.cc
+++ b/test/qs8-igemm-minmax-rndnu.cc
@@ -22,6 +22,942 @@
#include "gemm-microkernel-tester.h"
+#if XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_gt_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, n_div_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_PRFM_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_gt_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, n_div_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_RNDNU_4X8__AARCH64_NEON_MLAL_LANE_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64, xnn_init_qs8_conv_minmax_rndnu_neon_params, xnn_init_qs8_requantization_rndnu_params, xnn_qs8_requantize_rndnu);
+ }
+#endif // XNN_ARCH_ARM64
+
+
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(QS8_IGEMM_MINMAX_RNDNU_1X8C2__NEON_MULL_LD1R, k_eq_8) {
TEST_REQUIRES_ARM_NEON;
diff --git a/test/qs8-igemm-minmax-rndnu.yaml b/test/qs8-igemm-minmax-rndnu.yaml
index 6bf87ad..c8a3b19 100644
--- a/test/qs8-igemm-minmax-rndnu.yaml
+++ b/test/qs8-igemm-minmax-rndnu.yaml
@@ -3,6 +3,12 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_prfm_ld64
+ init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_rndnu_ukernel_4x8__aarch64_neon_mlal_lane_ld64
+ init: xnn_init_qs8_conv_minmax_rndnu_neon_params
+ k-block: 8
- name: xnn_qs8_igemm_minmax_rndnu_ukernel_1x8c2__neon_mull_ld1r
init: xnn_init_qs8_conv_minmax_rndnu_neon_params
k-block: 8