NEON Dot Product implementations of QS8 FP32 c4 GEMM and IGEMM assembly microkernels
PiperOrigin-RevId: 382208248
diff --git a/BUILD.bazel b/BUILD.bazel
index 21c8af5..58f1281 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4279,6 +4279,8 @@
"src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S",
"src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S",
"src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S",
+ "src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S",
+ "src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S",
"src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S",
"src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S",
"src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S",
@@ -4289,11 +4291,14 @@
"src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S",
"src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S",
"src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+ "src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S",
+ "src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S",
+ "src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S",
+ "src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
"src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S",
"src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S",
"src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S",
"src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S",
- "src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
"src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S",
"src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S",
"src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S",
@@ -4305,6 +4310,9 @@
"src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S",
"src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S",
"src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
+ "src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S",
+ "src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S",
+ "src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
"src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S",
"src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S",
"src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e5d7517..220175e 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3503,6 +3503,8 @@
src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
+ src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S
+ src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S
src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
@@ -3513,11 +3515,14 @@
src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S
src/qs8-gemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S
+ src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
+ src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S
+ src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
+ src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
- src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-cortex-a53.S
src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm-cortex-a53.S
src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
@@ -3529,6 +3534,9 @@
src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-cortex-a53.S
src/qs8-igemm/gen/4x16-minmax-gemmlowp-aarch64-neon-mlal-lane-prfm-cortex-a53.S
+ src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
+ src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
+ src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S)
diff --git a/scripts/generate-qs8-gemm.sh b/scripts/generate-qs8-gemm.sh
index eed993a..33fec41 100755
--- a/scripts/generate-qs8-gemm.sh
+++ b/scripts/generate-qs8-gemm.sh
@@ -197,15 +197,22 @@
tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in -D PREFETCH=0 -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal.S
tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mlal-padal.S.in -D PREFETCH=1 -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
-tools/xngen src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in -o src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
-tools/xngen src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in -o src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
tools/xngen src/qs8-gemm/2x8c16-aarch64-neon-mlal-padal.S.in -o src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
tools/xngen src/qs8-gemm/2x8c8-aarch64-neon-mull-padal.S.in -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mull-padal.S
-tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in -o src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
-tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in -o src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
-tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in -o src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
+### C4 micro-kernels
+tools/xngen src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
+tools/xngen src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
+tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
+tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
+tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
+
+tools/xngen src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in -D REQUANTIZATION=FP32 -o src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S
+tools/xngen src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in -D REQUANTIZATION=FP32 -o src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S
+tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=FP32 -o src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
+tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in -D REQUANTIZATION=FP32 -o src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S
+tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in -D REQUANTIZATION=FP32 -o src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
tools/xngen src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in -D REQUANTIZATION=FP32 -o src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
################################### x86 SSE ###################################
diff --git a/scripts/generate-qs8-igemm.sh b/scripts/generate-qs8-igemm.sh
index 4387185..40df3ec 100755
--- a/scripts/generate-qs8-igemm.sh
+++ b/scripts/generate-qs8-igemm.sh
@@ -195,9 +195,15 @@
tools/xngen src/qs8-igemm/2x8c8-aarch64-neon-mlal-padal.S.in -D PREFETCH=1 -o src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-aarch64-neon-mlal-padal-prfm.S
tools/xngen src/qs8-igemm/2x8c16-aarch64-neon-mlal-padal.S.in -o src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-aarch64-neon-mlal-padal.S
-tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
-tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
-tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
+
+### C4 micro-kernels
+tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=GEMMLOWP -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
+tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in -D REQUANTIZATION=GEMMLOWP -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
+tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in -D REQUANTIZATION=GEMMLOWP -o src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
+
+tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=FP32 -o src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
+tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in -D REQUANTIZATION=FP32 -o src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
+tools/xngen src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in -D REQUANTIZATION=FP32 -o src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
################################### x86 SSE ###################################
### C2 micro-kernels
diff --git a/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in b/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in
index 3d75e39..3043049 100644
--- a/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in
+++ b/src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in
@@ -5,7 +5,10 @@
#include <xnnpack/assembly.h>
-# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x16c4__aarch64_neondot_ld32(
# size_t mr, x0
# size_t nc, x1
# size_t kc, x2 / x0
@@ -15,7 +18,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, (x7)
# size_t cn_stride, [sp] -> x12
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+# const union ${CONV_PARAMS} params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -25,7 +28,7 @@
# C0 x6 v28 v29 v30 v31
# unused v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x16c4__aarch64_neondot_ld32
0:
# Load initial bias from w into accumulators
ADD x2, x2, 3 // kc = (kc + 3) & ~3
@@ -50,30 +53,54 @@
SDOT v31.4s, v19.16b, v0.4b[0]
B.HI 1b
- # Apply params - scale, shift, bias and clamp
- LD2R {v0.4s, v1.4s}, [x11], 8
- CMEQ v2.4s, v1.4s, 0
- SQRDMULH v4.4s, v28.4s, v0.4s
- SQRDMULH v5.4s, v29.4s, v0.4s
- SQRDMULH v6.4s, v30.4s, v0.4s
- SQRDMULH v7.4s, v31.4s, v0.4s
- BIC v28.16b, v28.16b, v2.16b
- BIC v29.16b, v29.16b, v2.16b
- BIC v30.16b, v30.16b, v2.16b
- BIC v31.16b, v31.16b, v2.16b
- SSRA v4.4s, v28.4s, 31 // signed shift right accumulate
- SSRA v5.4s, v29.4s, 31
- SSRA v6.4s, v30.4s, 31
- SSRA v7.4s, v31.4s, 31
- SRSHL v4.4s, v4.4s, v1.4s // signed rounding shift left
- SRSHL v5.4s, v5.4s, v1.4s
- SRSHL v6.4s, v6.4s, v1.4s
- SRSHL v7.4s, v7.4s, v1.4s
+ $if REQUANTIZATION == "GEMMLOWP":
+ # Apply params - scale, shift, bias and clamp
+ LD2R {v0.4s, v1.4s}, [x11], 8
+ CMEQ v2.4s, v1.4s, 0
+
+ BIC v4.16b, v28.16b, v2.16b
+ BIC v5.16b, v29.16b, v2.16b
+ BIC v6.16b, v30.16b, v2.16b
+ BIC v7.16b, v31.16b, v2.16b
+
+ SQRDMULH v28.4s, v28.4s, v0.4s
+ SQRDMULH v29.4s, v29.4s, v0.4s
+ SQRDMULH v30.4s, v30.4s, v0.4s
+ SQRDMULH v31.4s, v31.4s, v0.4s
+
+ SSRA v28.4s, v4.4s, 31
+ SSRA v29.4s, v5.4s, 31
+ SSRA v30.4s, v6.4s, 31
+ SSRA v31.4s, v7.4s, 31
+
+ SRSHL v28.4s, v28.4s, v1.4s // signed rounding shift left
+ SRSHL v29.4s, v29.4s, v1.4s
+ SRSHL v30.4s, v30.4s, v1.4s
+ SRSHL v31.4s, v31.4s, v1.4s
+ $elif REQUANTIZATION == "FP32":
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x11], 4
+
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
LD1R {v2.8h}, [x11], 2 // add bias
- SQXTN v4.4h, v4.4s
- SQXTN v6.4h, v6.4s
- SQXTN2 v4.8h, v5.4s
- SQXTN2 v6.8h, v7.4s
+ SQXTN v4.4h, v28.4s
+ SQXTN v6.4h, v30.4s
+ SQXTN2 v4.8h, v29.4s
+ SQXTN2 v6.8h, v31.4s
LD2R {v0.16b, v1.16b}, [x11] // clamp to min/max
SQADD v4.8h, v4.8h, v2.8h
SQADD v6.8h, v6.8h, v2.8h
@@ -111,7 +138,7 @@
6:
RET
-END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32
+END_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x16c4__aarch64_neondot_ld32
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in b/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in
index a7973d6..a9a72b5 100644
--- a/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in
+++ b/src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in
@@ -5,7 +5,10 @@
#include <xnnpack/assembly.h>
-# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x16c4__aarch64_neondot_ld64(
# size_t mr, x0
# size_t nc, x1
# size_t kc, x2 / x0
@@ -15,7 +18,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, (x7)
# size_t cn_stride, [sp] -> x12
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+# const union ${CONV_PARAMS} params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -25,7 +28,7 @@
# C0 x6 v28 v29 v30 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x16c4__aarch64_neondot_ld64
ADD x2, x2, 3 // kc = (kc + 3) & ~3
BIC x2, x2, 3
@@ -68,30 +71,56 @@
TBNZ x0, 2, 3f
2:
- # Apply params - scale, shift, bias and clamp
- LD2R {v0.4s, v1.4s}, [x11], 8
- SQRDMULH v4.4s, v28.4s, v0.4s
- SQRDMULH v5.4s, v29.4s, v0.4s
- CMEQ v2.4s, v1.4s, 0
- SQRDMULH v6.4s, v30.4s, v0.4s
- SQRDMULH v7.4s, v31.4s, v0.4s
- BIC v28.16b, v28.16b, v2.16b
- BIC v29.16b, v29.16b, v2.16b
- BIC v30.16b, v30.16b, v2.16b
- BIC v31.16b, v31.16b, v2.16b
- SSRA v4.4s, v28.4s, 31 // signed shift right accumulate
- SSRA v5.4s, v29.4s, 31
- SSRA v6.4s, v30.4s, 31
- SSRA v7.4s, v31.4s, 31
- SRSHL v4.4s, v4.4s, v1.4s // signed rounding shift left
- SRSHL v5.4s, v5.4s, v1.4s
- SRSHL v6.4s, v6.4s, v1.4s
- SRSHL v7.4s, v7.4s, v1.4s
+
+ $if REQUANTIZATION == "GEMMLOWP":
+ # Apply params - scale, shift, bias and clamp
+ LD2R {v0.4s, v1.4s}, [x11], 8
+ CMEQ v2.4s, v1.4s, 0
+
+ BIC v4.16b, v28.16b, v2.16b
+ BIC v5.16b, v29.16b, v2.16b
+ BIC v6.16b, v30.16b, v2.16b
+ BIC v7.16b, v31.16b, v2.16b
+
+ SQRDMULH v28.4s, v28.4s, v0.4s
+ SQRDMULH v29.4s, v29.4s, v0.4s
+ SQRDMULH v30.4s, v30.4s, v0.4s
+ SQRDMULH v31.4s, v31.4s, v0.4s
+
+ SSRA v28.4s, v4.4s, 31
+ SSRA v29.4s, v5.4s, 31
+ SSRA v30.4s, v6.4s, 31
+ SSRA v31.4s, v7.4s, 31
+
+ SRSHL v28.4s, v28.4s, v1.4s // signed rounding shift left
+ SRSHL v29.4s, v29.4s, v1.4s
+ SRSHL v30.4s, v30.4s, v1.4s
+ SRSHL v31.4s, v31.4s, v1.4s
+ $elif REQUANTIZATION == "FP32":
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x11], 4
+
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
LD1R {v2.8h}, [x11], 2 // add bias
- SQXTN v4.4h, v4.4s
- SQXTN v6.4h, v6.4s
- SQXTN2 v4.8h, v5.4s
- SQXTN2 v6.8h, v7.4s
+ SQXTN v4.4h, v28.4s
+ SQXTN v6.4h, v30.4s
+ SQXTN2 v4.8h, v29.4s
+ SQXTN2 v6.8h, v31.4s
+
LD2R {v0.16b, v1.16b}, [x11] // clamp to min/max
SQADD v4.8h, v4.8h, v2.8h
SQADD v6.8h, v6.8h, v2.8h
@@ -145,7 +174,7 @@
8:
RET
-END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64
+END_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_1x16c4__aarch64_neondot_ld64
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in b/src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
index d2eae7e..21d6f46 100644
--- a/src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
+++ b/src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
@@ -5,7 +5,10 @@
#include <xnnpack/assembly.h>
-# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55(
# size_t mr, x0
# size_t nc, x1
# size_t kc, x2 / x0
@@ -15,7 +18,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x12
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+# const union ${CONV_PARAMS} params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -31,7 +34,7 @@
# C3 x7 v19 v23 v27 v31
# unused v12 v13 v14 v15
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55
# Clamp A and C pointers
CMP x0, 2 // if mr < 2
@@ -65,7 +68,7 @@
LDP q24, q28, [x5], 32
MOV v19.16b, v16.16b
MOV v21.16b, v20.16b
- LDR x11, [sp, 40] // params
+ LDR x11, [sp, 40] // reload params
MOV v22.16b, v20.16b
MOV v23.16b, v20.16b
MOV v25.16b, v24.16b
@@ -403,131 +406,191 @@
.p2align 3
3:
- # Apply params - scale, shift, bias and clamp
- LD1R {v0.4s}, [x11], 4
- SQRDMULH v4.4s, v16.4s, v0.4s
- SQRDMULH v5.4s, v17.4s, v0.4s
- LD1R {v1.4s}, [x11], 4
- SQRDMULH v6.4s, v18.4s, v0.4s
- SQRDMULH v7.4s, v19.4s, v0.4s
- SQRDMULH v8.4s, v20.4s, v0.4s
- SQRDMULH v9.4s, v21.4s, v0.4s
- CMEQ v2.4s, v1.4s, 0
- SQRDMULH v10.4s, v22.4s, v0.4s
- SQRDMULH v11.4s, v23.4s, v0.4s
+ $if REQUANTIZATION == "GEMMLOWP":
+ # Apply params - scale, shift, bias and clamp
+ LD2R {v0.4s, v1.4s}, [x11], 8
+ CMEQ v2.4s, v1.4s, 0
- BIC v16.16b, v16.16b, v2.16b
- BIC v17.16b, v17.16b, v2.16b
- BIC v18.16b, v18.16b, v2.16b
- BIC v19.16b, v19.16b, v2.16b
- BIC v20.16b, v20.16b, v2.16b
- BIC v21.16b, v21.16b, v2.16b
- BIC v22.16b, v22.16b, v2.16b
- BIC v23.16b, v23.16b, v2.16b
+ BIC v4.16b, v16.16b, v2.16b
+ BIC v5.16b, v17.16b, v2.16b
+ BIC v6.16b, v18.16b, v2.16b
+ BIC v7.16b, v19.16b, v2.16b
- SSRA v4.4s, v16.4s, 31 // signed shift right accumulate
- SSRA v5.4s, v17.4s, 31
- SSRA v6.4s, v18.4s, 31
- SSRA v7.4s, v19.4s, 31
- SSRA v8.4s, v20.4s, 31
- SSRA v9.4s, v21.4s, 31
- SSRA v10.4s, v22.4s, 31
- SSRA v11.4s, v23.4s, 31
+ SQRDMULH v16.4s, v16.4s, v0.4s
+ SQRDMULH v17.4s, v17.4s, v0.4s
+ SQRDMULH v18.4s, v18.4s, v0.4s
+ SQRDMULH v19.4s, v19.4s, v0.4s
- SQRDMULH v16.4s, v24.4s, v0.4s
- SQRDMULH v17.4s, v25.4s, v0.4s
- SQRDMULH v18.4s, v26.4s, v0.4s
- SQRDMULH v19.4s, v27.4s, v0.4s
- SQRDMULH v20.4s, v28.4s, v0.4s
- SQRDMULH v21.4s, v29.4s, v0.4s
- SQRDMULH v22.4s, v30.4s, v0.4s
- SQRDMULH v23.4s, v31.4s, v0.4s
+ SSRA v16.4s, v4.4s, 31 // signed shift right accumulate
+ SSRA v17.4s, v5.4s, 31
+ SSRA v18.4s, v6.4s, 31
+ SSRA v19.4s, v7.4s, 31
- BIC v24.16b, v24.16b, v2.16b
- BIC v25.16b, v25.16b, v2.16b
- BIC v26.16b, v26.16b, v2.16b
- BIC v27.16b, v27.16b, v2.16b
- BIC v28.16b, v28.16b, v2.16b
- BIC v29.16b, v29.16b, v2.16b
- BIC v30.16b, v30.16b, v2.16b
- BIC v31.16b, v31.16b, v2.16b
+ BIC v4.16b, v20.16b, v2.16b
+ BIC v5.16b, v21.16b, v2.16b
+ BIC v6.16b, v22.16b, v2.16b
+ BIC v7.16b, v23.16b, v2.16b
- SSRA v16.4s, v24.4s, 31
- SSRA v17.4s, v25.4s, 31
- SSRA v18.4s, v26.4s, 31
- SSRA v19.4s, v27.4s, 31
- SSRA v20.4s, v28.4s, 31
- SSRA v21.4s, v29.4s, 31
- SSRA v22.4s, v30.4s, 31
- SSRA v23.4s, v31.4s, 31
+ SQRDMULH v20.4s, v20.4s, v0.4s
+ SQRDMULH v21.4s, v21.4s, v0.4s
+ SQRDMULH v22.4s, v22.4s, v0.4s
+ SQRDMULH v23.4s, v23.4s, v0.4s
- SRSHL v4.4s, v4.4s, v1.4s // signed rounding shift left
- SRSHL v5.4s, v5.4s, v1.4s
- SRSHL v6.4s, v6.4s, v1.4s
- SRSHL v7.4s, v7.4s, v1.4s
- SRSHL v8.4s, v8.4s, v1.4s
- SRSHL v9.4s, v9.4s, v1.4s
- SRSHL v10.4s, v10.4s, v1.4s
- SRSHL v11.4s, v11.4s, v1.4s
+ SSRA v20.4s, v4.4s, 31
+ SSRA v21.4s, v5.4s, 31
+ SSRA v22.4s, v6.4s, 31
+ SSRA v23.4s, v7.4s, 31
- SRSHL v16.4s, v16.4s, v1.4s
- SRSHL v17.4s, v17.4s, v1.4s
- SRSHL v18.4s, v18.4s, v1.4s
- SRSHL v19.4s, v19.4s, v1.4s
- SRSHL v20.4s, v20.4s, v1.4s
- SRSHL v21.4s, v21.4s, v1.4s
- SRSHL v22.4s, v22.4s, v1.4s
- SRSHL v23.4s, v23.4s, v1.4s
+ BIC v4.16b, v24.16b, v2.16b
+ BIC v5.16b, v25.16b, v2.16b
+ BIC v6.16b, v26.16b, v2.16b
+ BIC v7.16b, v27.16b, v2.16b
- SQXTN v4.4h, v4.4s
- SQXTN v5.4h, v5.4s
- SQXTN v6.4h, v6.4s
- SQXTN v7.4h, v7.4s
+ SQRDMULH v24.4s, v24.4s, v0.4s
+ SQRDMULH v25.4s, v25.4s, v0.4s
+ SQRDMULH v26.4s, v26.4s, v0.4s
+ SQRDMULH v27.4s, v27.4s, v0.4s
+
+ SSRA v24.4s, v4.4s, 31
+ SSRA v25.4s, v5.4s, 31
+ SSRA v26.4s, v6.4s, 31
+ SSRA v27.4s, v7.4s, 31
+
+ BIC v4.16b, v28.16b, v2.16b
+ BIC v5.16b, v29.16b, v2.16b
+ BIC v6.16b, v30.16b, v2.16b
+ BIC v7.16b, v31.16b, v2.16b
+
+ SQRDMULH v28.4s, v28.4s, v0.4s
+ SQRDMULH v29.4s, v29.4s, v0.4s
+ SQRDMULH v30.4s, v30.4s, v0.4s
+ SQRDMULH v31.4s, v31.4s, v0.4s
+
+ SSRA v28.4s, v4.4s, 31
+ SSRA v29.4s, v5.4s, 31
+ SSRA v30.4s, v6.4s, 31
+ SSRA v31.4s, v7.4s, 31
+
+ SRSHL v16.4s, v16.4s, v1.4s // signed rounding shift left
+ SRSHL v17.4s, v17.4s, v1.4s
+ SRSHL v18.4s, v18.4s, v1.4s
+ SRSHL v19.4s, v19.4s, v1.4s
+ SRSHL v20.4s, v20.4s, v1.4s
+ SRSHL v21.4s, v21.4s, v1.4s
+ SRSHL v22.4s, v22.4s, v1.4s
+ SRSHL v23.4s, v23.4s, v1.4s
+ SRSHL v24.4s, v24.4s, v1.4s
+ SRSHL v25.4s, v25.4s, v1.4s
+ SRSHL v26.4s, v26.4s, v1.4s
+ SRSHL v27.4s, v27.4s, v1.4s
+ SRSHL v28.4s, v28.4s, v1.4s
+ SRSHL v29.4s, v29.4s, v1.4s
+ SRSHL v30.4s, v30.4s, v1.4s
+ SRSHL v31.4s, v31.4s, v1.4s
+ $elif REQUANTIZATION == "FP32":
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x11], 4
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v0.4s, v16.4s
+ FMUL v17.4s, v0.4s, v17.4s
+ FMUL v18.4s, v0.4s, v18.4s
+ FMUL v19.4s, v0.4s, v19.4s
+ FMUL v20.4s, v0.4s, v20.4s
+ FMUL v21.4s, v0.4s, v21.4s
+ FMUL v22.4s, v0.4s, v22.4s
+ FMUL v23.4s, v0.4s, v23.4s
+ FMUL v24.4s, v0.4s, v24.4s
+ FMUL v25.4s, v0.4s, v25.4s
+ FMUL v26.4s, v0.4s, v26.4s
+ FMUL v27.4s, v0.4s, v27.4s
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
SQXTN v16.4h, v16.4s
SQXTN v17.4h, v17.4s
SQXTN v18.4h, v18.4s
SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
LD1R {v2.8h}, [x11], 2 // add bias
- SQXTN2 v4.8h, v8.4s
- SQXTN2 v5.8h, v9.4s
- SQXTN2 v6.8h, v10.4s
- SQXTN2 v7.8h, v11.4s
SQXTN2 v16.8h, v20.4s
SQXTN2 v17.8h, v21.4s
SQXTN2 v18.8h, v22.4s
SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
- SQADD v4.8h, v4.8h, v2.8h
- SQADD v5.8h, v5.8h, v2.8h
- SQADD v6.8h, v6.8h, v2.8h
- SQADD v7.8h, v7.8h, v2.8h
SQADD v16.8h, v16.8h, v2.8h
SQADD v17.8h, v17.8h, v2.8h
SQADD v18.8h, v18.8h, v2.8h
SQADD v19.8h, v19.8h, v2.8h
+ SQADD v24.8h, v24.8h, v2.8h
+ SQADD v25.8h, v25.8h, v2.8h
+ SQADD v26.8h, v26.8h, v2.8h
+ SQADD v27.8h, v27.8h, v2.8h
LD1R {v0.16b}, [x11], 1 // clamp min value
- SQXTN v4.8b, v4.8h
- SQXTN v5.8b, v5.8h
- SQXTN v6.8b, v6.8h
- SQXTN v7.8b, v7.8h
+ SQXTN v4.8b, v16.8h
+ SQXTN v5.8b, v17.8h
+ SQXTN v6.8b, v18.8h
+ SQXTN v7.8b, v19.8h
LD1R {v1.16b}, [x11] // clamp max value
- SQXTN2 v4.16b, v16.8h
- SQXTN2 v5.16b, v17.8h
- SQXTN2 v6.16b, v18.8h
- SQXTN2 v7.16b, v19.8h
+ SQXTN2 v4.16b, v24.8h
+ SQXTN2 v5.16b, v25.8h
+ SQXTN2 v6.16b, v26.8h
+ SQXTN2 v7.16b, v27.8h
LDR x12, [sp, 32] // cn_stride
- SMAX v4.16b, v4.16b, v0.16b
- SMAX v5.16b, v5.16b, v0.16b
- SMAX v6.16b, v6.16b, v0.16b
- SMAX v7.16b, v7.16b, v0.16b
+ SMAX v4.16b, v4.16b, v0.16b
+ SMAX v5.16b, v5.16b, v0.16b
+ SMAX v6.16b, v6.16b, v0.16b
+ SMAX v7.16b, v7.16b, v0.16b
SUBS x1, x1, 16
- SMIN v4.16b, v4.16b, v1.16b
- SMIN v5.16b, v5.16b, v1.16b
- SMIN v6.16b, v6.16b, v1.16b
- SMIN v7.16b, v7.16b, v1.16b
+ SMIN v4.16b, v4.16b, v1.16b
+ SMIN v5.16b, v5.16b, v1.16b
+ SMIN v6.16b, v6.16b, v1.16b
+ SMIN v7.16b, v7.16b, v1.16b
B.LO 6f
# Store full 4 x 16
@@ -658,7 +721,7 @@
LDP d8, d9, [sp], 32
RET
-END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
+END_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in b/src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in
index 2ce8bfc..3cb9147 100644
--- a/src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in
+++ b/src/qs8-gemm/4x16c4-aarch64-neondot-ld128.S.in
@@ -21,7 +21,7 @@
# const union ${CONV_PARAMS} params) [sp + 8] -> x11
$if REQUANTIZATION == "GEMMLOWP":
- # params structure is 11 bytes
+ # params structure is 12 bytes
# struct {
# int32_t multiplier;
# int32_t right_shift;
diff --git a/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in b/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in
index b482c79..b843647 100644
--- a/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in
+++ b/src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in
@@ -5,7 +5,10 @@
#include <xnnpack/assembly.h>
-# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld32(
# size_t mr, x0
# size_t nc, x1
# size_t kc, x2 / x0
@@ -15,7 +18,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x12
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+# const union ${CONV_PARAMS} params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -31,7 +34,7 @@
# C3 x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld32
# Clamp A and C pointers
CMP x0, 2 // if mr < 2
@@ -63,7 +66,7 @@
LDP q24, q28, [x5], 32
MOV v19.16b, v16.16b
MOV v21.16b, v20.16b
- LDR x11, [sp, 8] // params
+ LDR x11, [sp, 8] // reload params
MOV v22.16b, v20.16b
MOV v23.16b, v20.16b
MOV x0, x2 // k = kc. assumes kc > 0
@@ -104,86 +107,141 @@
SDOT v31.4s, v7.16b, v3.4b[0]
B.HI 1b
- # Apply params - scale, shift, bias and clamp
- LD2R {v0.4s, v1.4s}, [x11], 8
- CMEQ v2.4s, v1.4s, 0
+ $if REQUANTIZATION == "GEMMLOWP":
+ # Apply params - scale, shift, bias and clamp
+ LD2R {v0.4s, v1.4s}, [x11], 8
+ CMEQ v2.4s, v1.4s, 0
- BIC v4.16b, v16.16b, v2.16b
- BIC v5.16b, v17.16b, v2.16b
- BIC v6.16b, v18.16b, v2.16b
- BIC v7.16b, v19.16b, v2.16b
+ BIC v4.16b, v16.16b, v2.16b
+ BIC v5.16b, v17.16b, v2.16b
+ BIC v6.16b, v18.16b, v2.16b
+ BIC v7.16b, v19.16b, v2.16b
- SQRDMULH v16.4s, v16.4s, v0.4s
- SQRDMULH v17.4s, v17.4s, v0.4s
- SQRDMULH v18.4s, v18.4s, v0.4s
- SQRDMULH v19.4s, v19.4s, v0.4s
+ SQRDMULH v16.4s, v16.4s, v0.4s
+ SQRDMULH v17.4s, v17.4s, v0.4s
+ SQRDMULH v18.4s, v18.4s, v0.4s
+ SQRDMULH v19.4s, v19.4s, v0.4s
- SSRA v16.4s, v4.4s, 31 // signed shift right accumulate
- SSRA v17.4s, v5.4s, 31
- SSRA v18.4s, v6.4s, 31
- SSRA v19.4s, v7.4s, 31
+ SSRA v16.4s, v4.4s, 31 // signed shift right accumulate
+ SSRA v17.4s, v5.4s, 31
+ SSRA v18.4s, v6.4s, 31
+ SSRA v19.4s, v7.4s, 31
- BIC v4.16b, v20.16b, v2.16b
- BIC v5.16b, v21.16b, v2.16b
- BIC v6.16b, v22.16b, v2.16b
- BIC v7.16b, v23.16b, v2.16b
+ BIC v4.16b, v20.16b, v2.16b
+ BIC v5.16b, v21.16b, v2.16b
+ BIC v6.16b, v22.16b, v2.16b
+ BIC v7.16b, v23.16b, v2.16b
- SQRDMULH v20.4s, v20.4s, v0.4s
- SQRDMULH v21.4s, v21.4s, v0.4s
- SQRDMULH v22.4s, v22.4s, v0.4s
- SQRDMULH v23.4s, v23.4s, v0.4s
+ SQRDMULH v20.4s, v20.4s, v0.4s
+ SQRDMULH v21.4s, v21.4s, v0.4s
+ SQRDMULH v22.4s, v22.4s, v0.4s
+ SQRDMULH v23.4s, v23.4s, v0.4s
- SSRA v20.4s, v4.4s, 31
- SSRA v21.4s, v5.4s, 31
- SSRA v22.4s, v6.4s, 31
- SSRA v23.4s, v7.4s, 31
+ SSRA v20.4s, v4.4s, 31
+ SSRA v21.4s, v5.4s, 31
+ SSRA v22.4s, v6.4s, 31
+ SSRA v23.4s, v7.4s, 31
- BIC v4.16b, v24.16b, v2.16b
- BIC v5.16b, v25.16b, v2.16b
- BIC v6.16b, v26.16b, v2.16b
- BIC v7.16b, v27.16b, v2.16b
+ BIC v4.16b, v24.16b, v2.16b
+ BIC v5.16b, v25.16b, v2.16b
+ BIC v6.16b, v26.16b, v2.16b
+ BIC v7.16b, v27.16b, v2.16b
- SQRDMULH v24.4s, v24.4s, v0.4s
- SQRDMULH v25.4s, v25.4s, v0.4s
- SQRDMULH v26.4s, v26.4s, v0.4s
- SQRDMULH v27.4s, v27.4s, v0.4s
+ SQRDMULH v24.4s, v24.4s, v0.4s
+ SQRDMULH v25.4s, v25.4s, v0.4s
+ SQRDMULH v26.4s, v26.4s, v0.4s
+ SQRDMULH v27.4s, v27.4s, v0.4s
- SSRA v24.4s, v4.4s, 31
- SSRA v25.4s, v5.4s, 31
- SSRA v26.4s, v6.4s, 31
- SSRA v27.4s, v7.4s, 31
+ SSRA v24.4s, v4.4s, 31
+ SSRA v25.4s, v5.4s, 31
+ SSRA v26.4s, v6.4s, 31
+ SSRA v27.4s, v7.4s, 31
- BIC v4.16b, v28.16b, v2.16b
- BIC v5.16b, v29.16b, v2.16b
- BIC v6.16b, v30.16b, v2.16b
- BIC v7.16b, v31.16b, v2.16b
+ BIC v4.16b, v28.16b, v2.16b
+ BIC v5.16b, v29.16b, v2.16b
+ BIC v6.16b, v30.16b, v2.16b
+ BIC v7.16b, v31.16b, v2.16b
- SQRDMULH v28.4s, v28.4s, v0.4s
- SQRDMULH v29.4s, v29.4s, v0.4s
- SQRDMULH v30.4s, v30.4s, v0.4s
- SQRDMULH v31.4s, v31.4s, v0.4s
+ SQRDMULH v28.4s, v28.4s, v0.4s
+ SQRDMULH v29.4s, v29.4s, v0.4s
+ SQRDMULH v30.4s, v30.4s, v0.4s
+ SQRDMULH v31.4s, v31.4s, v0.4s
- SSRA v28.4s, v4.4s, 31
- SSRA v29.4s, v5.4s, 31
- SSRA v30.4s, v6.4s, 31
- SSRA v31.4s, v7.4s, 31
+ SSRA v28.4s, v4.4s, 31
+ SSRA v29.4s, v5.4s, 31
+ SSRA v30.4s, v6.4s, 31
+ SSRA v31.4s, v7.4s, 31
- SRSHL v16.4s, v16.4s, v1.4s // signed rounding shift left
- SRSHL v17.4s, v17.4s, v1.4s
- SRSHL v18.4s, v18.4s, v1.4s
- SRSHL v19.4s, v19.4s, v1.4s
- SRSHL v20.4s, v20.4s, v1.4s
- SRSHL v21.4s, v21.4s, v1.4s
- SRSHL v22.4s, v22.4s, v1.4s
- SRSHL v23.4s, v23.4s, v1.4s
- SRSHL v24.4s, v24.4s, v1.4s
- SRSHL v25.4s, v25.4s, v1.4s
- SRSHL v26.4s, v26.4s, v1.4s
- SRSHL v27.4s, v27.4s, v1.4s
- SRSHL v28.4s, v28.4s, v1.4s
- SRSHL v29.4s, v29.4s, v1.4s
- SRSHL v30.4s, v30.4s, v1.4s
- SRSHL v31.4s, v31.4s, v1.4s
+ SRSHL v16.4s, v16.4s, v1.4s // signed rounding shift left
+ SRSHL v17.4s, v17.4s, v1.4s
+ SRSHL v18.4s, v18.4s, v1.4s
+ SRSHL v19.4s, v19.4s, v1.4s
+ SRSHL v20.4s, v20.4s, v1.4s
+ SRSHL v21.4s, v21.4s, v1.4s
+ SRSHL v22.4s, v22.4s, v1.4s
+ SRSHL v23.4s, v23.4s, v1.4s
+ SRSHL v24.4s, v24.4s, v1.4s
+ SRSHL v25.4s, v25.4s, v1.4s
+ SRSHL v26.4s, v26.4s, v1.4s
+ SRSHL v27.4s, v27.4s, v1.4s
+ SRSHL v28.4s, v28.4s, v1.4s
+ SRSHL v29.4s, v29.4s, v1.4s
+ SRSHL v30.4s, v30.4s, v1.4s
+ SRSHL v31.4s, v31.4s, v1.4s
+ $elif REQUANTIZATION == "FP32":
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x11], 4
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v0.4s, v16.4s
+ FMUL v17.4s, v0.4s, v17.4s
+ FMUL v18.4s, v0.4s, v18.4s
+ FMUL v19.4s, v0.4s, v19.4s
+ FMUL v20.4s, v0.4s, v20.4s
+ FMUL v21.4s, v0.4s, v21.4s
+ FMUL v22.4s, v0.4s, v22.4s
+ FMUL v23.4s, v0.4s, v23.4s
+ FMUL v24.4s, v0.4s, v24.4s
+ FMUL v25.4s, v0.4s, v25.4s
+ FMUL v26.4s, v0.4s, v26.4s
+ FMUL v27.4s, v0.4s, v27.4s
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
SQXTN v16.4h, v16.4s
SQXTN v17.4h, v17.4s
@@ -289,7 +347,7 @@
6:
RET
-END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32
+END_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld32
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in b/src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in
index a87d160..c0ba8d8 100644
--- a/src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in
+++ b/src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in
@@ -5,7 +5,10 @@
#include <xnnpack/assembly.h>
-# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld64(
# size_t mr, x0
# size_t nc, x1
# size_t kc, x2 / x0
@@ -15,7 +18,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x12
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+# const union ${CONV_PARAMS} params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -31,7 +34,7 @@
# C3 x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15
-BEGIN_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld64
# Clamp A and C pointers
CMP x0, 2 // if mr < 2
@@ -127,86 +130,141 @@
TBNZ x0, 2, 3f
2:
- # Apply params - scale, shift, bias and clamp
- LD2R {v0.4s, v1.4s}, [x11], 8
- CMEQ v2.4s, v1.4s, 0
+ $if REQUANTIZATION == "GEMMLOWP":
+ # Apply params - scale, shift, bias and clamp
+ LD2R {v0.4s, v1.4s}, [x11], 8
+ CMEQ v2.4s, v1.4s, 0
- BIC v4.16b, v16.16b, v2.16b
- BIC v5.16b, v17.16b, v2.16b
- BIC v6.16b, v18.16b, v2.16b
- BIC v7.16b, v19.16b, v2.16b
+ BIC v4.16b, v16.16b, v2.16b
+ BIC v5.16b, v17.16b, v2.16b
+ BIC v6.16b, v18.16b, v2.16b
+ BIC v7.16b, v19.16b, v2.16b
- SQRDMULH v16.4s, v16.4s, v0.4s
- SQRDMULH v17.4s, v17.4s, v0.4s
- SQRDMULH v18.4s, v18.4s, v0.4s
- SQRDMULH v19.4s, v19.4s, v0.4s
+ SQRDMULH v16.4s, v16.4s, v0.4s
+ SQRDMULH v17.4s, v17.4s, v0.4s
+ SQRDMULH v18.4s, v18.4s, v0.4s
+ SQRDMULH v19.4s, v19.4s, v0.4s
- SSRA v16.4s, v4.4s, 31 // signed shift right accumulate
- SSRA v17.4s, v5.4s, 31
- SSRA v18.4s, v6.4s, 31
- SSRA v19.4s, v7.4s, 31
+ SSRA v16.4s, v4.4s, 31 // signed shift right accumulate
+ SSRA v17.4s, v5.4s, 31
+ SSRA v18.4s, v6.4s, 31
+ SSRA v19.4s, v7.4s, 31
- BIC v4.16b, v20.16b, v2.16b
- BIC v5.16b, v21.16b, v2.16b
- BIC v6.16b, v22.16b, v2.16b
- BIC v7.16b, v23.16b, v2.16b
+ BIC v4.16b, v20.16b, v2.16b
+ BIC v5.16b, v21.16b, v2.16b
+ BIC v6.16b, v22.16b, v2.16b
+ BIC v7.16b, v23.16b, v2.16b
- SQRDMULH v20.4s, v20.4s, v0.4s
- SQRDMULH v21.4s, v21.4s, v0.4s
- SQRDMULH v22.4s, v22.4s, v0.4s
- SQRDMULH v23.4s, v23.4s, v0.4s
+ SQRDMULH v20.4s, v20.4s, v0.4s
+ SQRDMULH v21.4s, v21.4s, v0.4s
+ SQRDMULH v22.4s, v22.4s, v0.4s
+ SQRDMULH v23.4s, v23.4s, v0.4s
- SSRA v20.4s, v4.4s, 31
- SSRA v21.4s, v5.4s, 31
- SSRA v22.4s, v6.4s, 31
- SSRA v23.4s, v7.4s, 31
+ SSRA v20.4s, v4.4s, 31
+ SSRA v21.4s, v5.4s, 31
+ SSRA v22.4s, v6.4s, 31
+ SSRA v23.4s, v7.4s, 31
- BIC v4.16b, v24.16b, v2.16b
- BIC v5.16b, v25.16b, v2.16b
- BIC v6.16b, v26.16b, v2.16b
- BIC v7.16b, v27.16b, v2.16b
+ BIC v4.16b, v24.16b, v2.16b
+ BIC v5.16b, v25.16b, v2.16b
+ BIC v6.16b, v26.16b, v2.16b
+ BIC v7.16b, v27.16b, v2.16b
- SQRDMULH v24.4s, v24.4s, v0.4s
- SQRDMULH v25.4s, v25.4s, v0.4s
- SQRDMULH v26.4s, v26.4s, v0.4s
- SQRDMULH v27.4s, v27.4s, v0.4s
+ SQRDMULH v24.4s, v24.4s, v0.4s
+ SQRDMULH v25.4s, v25.4s, v0.4s
+ SQRDMULH v26.4s, v26.4s, v0.4s
+ SQRDMULH v27.4s, v27.4s, v0.4s
- SSRA v24.4s, v4.4s, 31
- SSRA v25.4s, v5.4s, 31
- SSRA v26.4s, v6.4s, 31
- SSRA v27.4s, v7.4s, 31
+ SSRA v24.4s, v4.4s, 31
+ SSRA v25.4s, v5.4s, 31
+ SSRA v26.4s, v6.4s, 31
+ SSRA v27.4s, v7.4s, 31
- BIC v4.16b, v28.16b, v2.16b
- BIC v5.16b, v29.16b, v2.16b
- BIC v6.16b, v30.16b, v2.16b
- BIC v7.16b, v31.16b, v2.16b
+ BIC v4.16b, v28.16b, v2.16b
+ BIC v5.16b, v29.16b, v2.16b
+ BIC v6.16b, v30.16b, v2.16b
+ BIC v7.16b, v31.16b, v2.16b
- SQRDMULH v28.4s, v28.4s, v0.4s
- SQRDMULH v29.4s, v29.4s, v0.4s
- SQRDMULH v30.4s, v30.4s, v0.4s
- SQRDMULH v31.4s, v31.4s, v0.4s
+ SQRDMULH v28.4s, v28.4s, v0.4s
+ SQRDMULH v29.4s, v29.4s, v0.4s
+ SQRDMULH v30.4s, v30.4s, v0.4s
+ SQRDMULH v31.4s, v31.4s, v0.4s
- SSRA v28.4s, v4.4s, 31
- SSRA v29.4s, v5.4s, 31
- SSRA v30.4s, v6.4s, 31
- SSRA v31.4s, v7.4s, 31
+ SSRA v28.4s, v4.4s, 31
+ SSRA v29.4s, v5.4s, 31
+ SSRA v30.4s, v6.4s, 31
+ SSRA v31.4s, v7.4s, 31
- SRSHL v16.4s, v16.4s, v1.4s // signed rounding shift left
- SRSHL v17.4s, v17.4s, v1.4s
- SRSHL v18.4s, v18.4s, v1.4s
- SRSHL v19.4s, v19.4s, v1.4s
- SRSHL v20.4s, v20.4s, v1.4s
- SRSHL v21.4s, v21.4s, v1.4s
- SRSHL v22.4s, v22.4s, v1.4s
- SRSHL v23.4s, v23.4s, v1.4s
- SRSHL v24.4s, v24.4s, v1.4s
- SRSHL v25.4s, v25.4s, v1.4s
- SRSHL v26.4s, v26.4s, v1.4s
- SRSHL v27.4s, v27.4s, v1.4s
- SRSHL v28.4s, v28.4s, v1.4s
- SRSHL v29.4s, v29.4s, v1.4s
- SRSHL v30.4s, v30.4s, v1.4s
- SRSHL v31.4s, v31.4s, v1.4s
+ SRSHL v16.4s, v16.4s, v1.4s // signed rounding shift left
+ SRSHL v17.4s, v17.4s, v1.4s
+ SRSHL v18.4s, v18.4s, v1.4s
+ SRSHL v19.4s, v19.4s, v1.4s
+ SRSHL v20.4s, v20.4s, v1.4s
+ SRSHL v21.4s, v21.4s, v1.4s
+ SRSHL v22.4s, v22.4s, v1.4s
+ SRSHL v23.4s, v23.4s, v1.4s
+ SRSHL v24.4s, v24.4s, v1.4s
+ SRSHL v25.4s, v25.4s, v1.4s
+ SRSHL v26.4s, v26.4s, v1.4s
+ SRSHL v27.4s, v27.4s, v1.4s
+ SRSHL v28.4s, v28.4s, v1.4s
+ SRSHL v29.4s, v29.4s, v1.4s
+ SRSHL v30.4s, v30.4s, v1.4s
+ SRSHL v31.4s, v31.4s, v1.4s
+ $elif REQUANTIZATION == "FP32":
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x11], 4
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v0.4s, v16.4s
+ FMUL v17.4s, v0.4s, v17.4s
+ FMUL v18.4s, v0.4s, v18.4s
+ FMUL v19.4s, v0.4s, v19.4s
+ FMUL v20.4s, v0.4s, v20.4s
+ FMUL v21.4s, v0.4s, v21.4s
+ FMUL v22.4s, v0.4s, v22.4s
+ FMUL v23.4s, v0.4s, v23.4s
+ FMUL v24.4s, v0.4s, v24.4s
+ FMUL v25.4s, v0.4s, v25.4s
+ FMUL v26.4s, v0.4s, v26.4s
+ FMUL v27.4s, v0.4s, v27.4s
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
SQXTN v16.4h, v16.4s
SQXTN v17.4h, v17.4s
@@ -340,7 +398,7 @@
8:
RET
-END_FUNCTION xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
+END_FUNCTION xnn_qs8_gemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld64
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S b/src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S
new file mode 100644
index 0000000..598355e
--- /dev/null
+++ b/src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld32.S
@@ -0,0 +1,122 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/1x16c4-aarch64-neondot-ld32.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const int8_t* restrict a, x3
+# size_t a_stride, (x4)
+# const void* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, (x7)
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qs8_minmax_params params) [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# B x5 v16 v17 v18 v19
+# C0 x6 v28 v29 v30 v31
+# unused v4 v5 v6 v7 v8 v9 v10 v11 v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32
+0:
+ # Load initial bias from w into accumulators
+ ADD x2, x2, 3 // kc = (kc + 3) & ~3
+ LDP q28, q29, [x5], 32
+ BIC x2, x2, 3
+ LDP q30, q31, [x5], 32
+ MOV x0, x2 // k = kc. assumes kc > 0
+ LDR x11, [sp, 8] // params
+
+ # Main loop - 4 bytes of A
+ .p2align 3
+1:
+ LDR s0, [x3], 4
+ LDR q16, [x5], 16
+ LDR q17, [x5], 16
+ LDR q18, [x5], 16
+ LDR q19, [x5], 16
+ SDOT v28.4s, v16.16b, v0.4b[0]
+ SDOT v29.4s, v17.16b, v0.4b[0]
+ SUBS x0, x0, 4
+ SDOT v30.4s, v18.16b, v0.4b[0]
+ SDOT v31.4s, v19.16b, v0.4b[0]
+ B.HI 1b
+
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x11], 4
+
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ LD1R {v2.8h}, [x11], 2 // add bias
+ SQXTN v4.4h, v28.4s
+ SQXTN v6.4h, v30.4s
+ SQXTN2 v4.8h, v29.4s
+ SQXTN2 v6.8h, v31.4s
+ LD2R {v0.16b, v1.16b}, [x11] // clamp to min/max
+ SQADD v4.8h, v4.8h, v2.8h
+ SQADD v6.8h, v6.8h, v2.8h
+ LDR x12, [sp] // cn_stride
+ SQXTN v4.8b, v4.8h
+ SQXTN2 v4.16b, v6.8h
+ SUBS x1, x1, 16
+ SMAX v4.16b, v4.16b, v0.16b
+ SMIN v4.16b, v4.16b, v1.16b
+ B.LO 2f
+
+ # Store full 1 x 16
+ ST1 {v4.16b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ B.NE 0b
+ RET
+
+ # Store odd width
+ .p2align 3
+2:
+ TBZ x1, 3, 3f
+ STR d4, [x6], 8
+ DUP d4, v4.d[1]
+3:
+ TBZ x1, 2, 4f
+ STR s4, [x6], 4
+ DUP s4, v4.s[1]
+4:
+ TBZ x1, 1, 5f
+ ST1 {v4.h}[0], [x6], 2
+ DUP h4, v4.h[1]
+5:
+ TBZ x1, 0, 6f
+ ST1 {v4.b}[0], [x6]
+6:
+ RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S b/src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S
new file mode 100644
index 0000000..2714f37
--- /dev/null
+++ b/src/qs8-gemm/gen/1x16c4-minmax-fp32-aarch64-neondot-ld64.S
@@ -0,0 +1,158 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/1x16c4-aarch64-neondot-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const int8_t* restrict a, x3
+# size_t a_stride, (x4)
+# const void* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, (x7)
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qs8_minmax_params params) [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# B x5 v4 v5 v6 v7 v16 v17 v18 v19
+# C0 x6 v28 v29 v30 v31
+# unused v8 v9 v10 v11 v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64
+ ADD x2, x2, 3 // kc = (kc + 3) & ~3
+ BIC x2, x2, 3
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q28, q29, [x5], 32
+ SUBS x0, x2, 8 // k = kc - 8
+ LDP q30, q31, [x5], 32
+ LDR x11, [sp, 8] // params
+
+ # Is there at least 8 bytes?
+ B.LO 3f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+1:
+ LDR d0, [x3], 8
+ LDR q16, [x5, 0]
+ LDR q17, [x5, 16]
+ SDOT v28.4s, v16.16b, v0.4b[0]
+ LDR q18, [x5, 32]
+ SDOT v29.4s, v17.16b, v0.4b[0]
+ LDR q19, [x5, 48]
+ SDOT v30.4s, v18.16b, v0.4b[0]
+ LDR q4, [x5, 64]
+ SDOT v31.4s, v19.16b, v0.4b[0]
+ LDR q5, [x5, 80]
+ SDOT v28.4s, v4.16b, v0.4b[1]
+ LDR q6, [x5, 96]
+ SDOT v29.4s, v5.16b, v0.4b[1]
+ LDR q7, [x5, 112]
+ SDOT v30.4s, v6.16b, v0.4b[1]
+ ADD x5, x5, 128
+ SDOT v31.4s, v7.16b, v0.4b[1]
+ SUBS x0, x0, 8
+ B.HS 1b
+
+ # Is there a remainder?- 1 to 4 bytes of A
+ TBNZ x0, 2, 3f
+
+2:
+
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x11], 4
+
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ LD1R {v2.8h}, [x11], 2 // add bias
+ SQXTN v4.4h, v28.4s
+ SQXTN v6.4h, v30.4s
+ SQXTN2 v4.8h, v29.4s
+ SQXTN2 v6.8h, v31.4s
+
+ LD2R {v0.16b, v1.16b}, [x11] // clamp to min/max
+ SQADD v4.8h, v4.8h, v2.8h
+ SQADD v6.8h, v6.8h, v2.8h
+ LDR x12, [sp] // cn_stride
+ SQXTN v4.8b, v4.8h
+ SQXTN2 v4.16b, v6.8h
+ SUBS x1, x1, 16
+ SMAX v4.16b, v4.16b, v0.16b
+ SMIN v4.16b, v4.16b, v1.16b
+ B.LO 4f
+
+ # Store full 1 x 16
+ ST1 {v4.16b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ B.NE 0b
+
+ RET
+
+ # Remainder - 4 bytes of A
+ .p2align 3
+3:
+ LDR s0, [x3], 4
+ LDR q16, [x5, 0]
+ LDR q17, [x5, 16]
+ SDOT v28.4s, v16.16b, v0.4b[0]
+ LDR q18, [x5, 32]
+ SDOT v29.4s, v17.16b, v0.4b[0]
+ LDR q19, [x5, 48]
+ SDOT v30.4s, v18.16b, v0.4b[0]
+ ADD x5, x5, 64
+ SDOT v31.4s, v19.16b, v0.4b[0]
+ B 2b
+
+ # Store odd width
+ .p2align 3
+4:
+ TBZ x1, 3, 5f
+ STR d4, [x6], 8
+ DUP d4, v4.d[1]
+5:
+ TBZ x1, 2, 6f
+ STR s4, [x6], 4
+ DUP s4, v4.s[1]
+6:
+ TBZ x1, 1, 7f
+ ST1 {v4.h}[0], [x6], 2
+ DUP h4, v4.h[1]
+7:
+ TBZ x1, 0, 8f
+ ST1 {v4.b}[0], [x6]
+8:
+ RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S b/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
index 0187c61..c2dac5d 100644
--- a/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
+++ b/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
@@ -9,6 +9,7 @@
#include <xnnpack/assembly.h>
+
# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32(
# size_t mr, x0
# size_t nc, x1
@@ -19,7 +20,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, (x7)
# size_t cn_stride, [sp] -> x12
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+# const union xnn_qs8_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -57,27 +58,32 @@
# Apply params - scale, shift, bias and clamp
LD2R {v0.4s, v1.4s}, [x11], 8
CMEQ v2.4s, v1.4s, 0
- SQRDMULH v4.4s, v28.4s, v0.4s
- SQRDMULH v5.4s, v29.4s, v0.4s
- SQRDMULH v6.4s, v30.4s, v0.4s
- SQRDMULH v7.4s, v31.4s, v0.4s
- BIC v28.16b, v28.16b, v2.16b
- BIC v29.16b, v29.16b, v2.16b
- BIC v30.16b, v30.16b, v2.16b
- BIC v31.16b, v31.16b, v2.16b
- SSRA v4.4s, v28.4s, 31 // signed shift right accumulate
- SSRA v5.4s, v29.4s, 31
- SSRA v6.4s, v30.4s, 31
- SSRA v7.4s, v31.4s, 31
- SRSHL v4.4s, v4.4s, v1.4s // signed rounding shift left
- SRSHL v5.4s, v5.4s, v1.4s
- SRSHL v6.4s, v6.4s, v1.4s
- SRSHL v7.4s, v7.4s, v1.4s
+
+ BIC v4.16b, v28.16b, v2.16b
+ BIC v5.16b, v29.16b, v2.16b
+ BIC v6.16b, v30.16b, v2.16b
+ BIC v7.16b, v31.16b, v2.16b
+
+ SQRDMULH v28.4s, v28.4s, v0.4s
+ SQRDMULH v29.4s, v29.4s, v0.4s
+ SQRDMULH v30.4s, v30.4s, v0.4s
+ SQRDMULH v31.4s, v31.4s, v0.4s
+
+ SSRA v28.4s, v4.4s, 31
+ SSRA v29.4s, v5.4s, 31
+ SSRA v30.4s, v6.4s, 31
+ SSRA v31.4s, v7.4s, 31
+
+ SRSHL v28.4s, v28.4s, v1.4s // signed rounding shift left
+ SRSHL v29.4s, v29.4s, v1.4s
+ SRSHL v30.4s, v30.4s, v1.4s
+ SRSHL v31.4s, v31.4s, v1.4s
+
LD1R {v2.8h}, [x11], 2 // add bias
- SQXTN v4.4h, v4.4s
- SQXTN v6.4h, v6.4s
- SQXTN2 v4.8h, v5.4s
- SQXTN2 v6.8h, v7.4s
+ SQXTN v4.4h, v28.4s
+ SQXTN v6.4h, v30.4s
+ SQXTN2 v4.8h, v29.4s
+ SQXTN2 v6.8h, v31.4s
LD2R {v0.16b, v1.16b}, [x11] // clamp to min/max
SQADD v4.8h, v4.8h, v2.8h
SQADD v6.8h, v6.8h, v2.8h
diff --git a/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S b/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
index 6c88bcd..bcb7064 100644
--- a/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
+++ b/src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
@@ -9,6 +9,7 @@
#include <xnnpack/assembly.h>
+
# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64(
# size_t mr, x0
# size_t nc, x1
@@ -19,7 +20,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, (x7)
# size_t cn_stride, [sp] -> x12
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+# const union xnn_qs8_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -72,30 +73,37 @@
TBNZ x0, 2, 3f
2:
- # Apply params - scale, shift, bias and clamp
+
+ # Apply params - scale, shift, bias and clamp
LD2R {v0.4s, v1.4s}, [x11], 8
- SQRDMULH v4.4s, v28.4s, v0.4s
- SQRDMULH v5.4s, v29.4s, v0.4s
CMEQ v2.4s, v1.4s, 0
- SQRDMULH v6.4s, v30.4s, v0.4s
- SQRDMULH v7.4s, v31.4s, v0.4s
- BIC v28.16b, v28.16b, v2.16b
- BIC v29.16b, v29.16b, v2.16b
- BIC v30.16b, v30.16b, v2.16b
- BIC v31.16b, v31.16b, v2.16b
- SSRA v4.4s, v28.4s, 31 // signed shift right accumulate
- SSRA v5.4s, v29.4s, 31
- SSRA v6.4s, v30.4s, 31
- SSRA v7.4s, v31.4s, 31
- SRSHL v4.4s, v4.4s, v1.4s // signed rounding shift left
- SRSHL v5.4s, v5.4s, v1.4s
- SRSHL v6.4s, v6.4s, v1.4s
- SRSHL v7.4s, v7.4s, v1.4s
+
+ BIC v4.16b, v28.16b, v2.16b
+ BIC v5.16b, v29.16b, v2.16b
+ BIC v6.16b, v30.16b, v2.16b
+ BIC v7.16b, v31.16b, v2.16b
+
+ SQRDMULH v28.4s, v28.4s, v0.4s
+ SQRDMULH v29.4s, v29.4s, v0.4s
+ SQRDMULH v30.4s, v30.4s, v0.4s
+ SQRDMULH v31.4s, v31.4s, v0.4s
+
+ SSRA v28.4s, v4.4s, 31
+ SSRA v29.4s, v5.4s, 31
+ SSRA v30.4s, v6.4s, 31
+ SSRA v31.4s, v7.4s, 31
+
+ SRSHL v28.4s, v28.4s, v1.4s // signed rounding shift left
+ SRSHL v29.4s, v29.4s, v1.4s
+ SRSHL v30.4s, v30.4s, v1.4s
+ SRSHL v31.4s, v31.4s, v1.4s
+
LD1R {v2.8h}, [x11], 2 // add bias
- SQXTN v4.4h, v4.4s
- SQXTN v6.4h, v6.4s
- SQXTN2 v4.8h, v5.4s
- SQXTN2 v6.8h, v7.4s
+ SQXTN v4.4h, v28.4s
+ SQXTN v6.4h, v30.4s
+ SQXTN2 v4.8h, v29.4s
+ SQXTN2 v6.8h, v31.4s
+
LD2R {v0.16b, v1.16b}, [x11] // clamp to min/max
SQADD v4.8h, v4.8h, v2.8h
SQADD v6.8h, v6.8h, v2.8h
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S b/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
new file mode 100644
index 0000000..a270fa14
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
@@ -0,0 +1,648 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const int8_t* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qs8_minmax_params params) [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0 v4
+# A1 x15 v1 v5
+# A2 x13 v2 v6
+# A3 x4 v3 v7
+# B x5 v8 v9 v10 v11
+# C0 x6 v16 v20 v24 v28
+# C1 x8 v17 v21 v25 v29
+# C2 x9 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+
+ # Clamp A and C pointers
+ CMP x0, 2 // if mr < 2
+ STP d8, d9, [sp, -32]!
+ ADD x15, x3, x4 // a1 = a0 + a_stride
+ ADD x8, x6, x7 // c1 = c0 + cm_stride
+ STP d10, d11, [sp, 16]
+ CSEL x15, x3, x15, LO // a1 = a0
+ CSEL x8, x6, x8, LO // c1 = c0
+ ADD x2, x2, 3 // kc = (kc + 3) & ~3
+
+ ADD x13, x15, x4 // a2 = a1 + a_stride
+ ADD x9, x8, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x13, x15, x13, LS // a2 = a1
+ CSEL x9, x8, x9, LS // c2 = c1
+ BIC x2, x2, 3
+
+ CMP x0, 4 // if mr < 4
+ ADD x4, x13, x4 // a3 = a2 + a_stride
+ ADD x7, x9, x7 // c3 = c2 + cm_stride
+ CSEL x4, x13, x4, LO // a3 = a2
+ CSEL x7, x9, x7, LO // c3 = c2
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ LDR x11, [sp, 40] // reload params
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ SUBS x0, x2, 16 // k = kc - 16
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ # Is there at least 16 bytes for prologue/epilogue?
+ B.LO 4f
+
+ # prologue - read A and B values for block 0 and 1
+ LDR d0, [x3], 8
+ LDR q8, [x5], 16
+ LDR d1, [x15], 8
+ LDR d2, [x13], 8
+ LDR d3, [x4], 8
+ SUBS x0, x0, 16 // is there 16 for main loop?
+ LDR d9, [x5], 8
+ LDR x14, [x5], 8
+ # Is there at least 16 bytes for main loop?
+ B.LO 2f
+
+ # Main loop - 16 bytes of A in 4 groups.
+ # 4 row of 4 vectors wide = 16 sdot instructions for 4 channels
+ # 4 LD64 for A
+ # 4 LD128 for W. = 2 LD64 + INS.
+ # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.
+
+ .p2align 3
+1:
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v0.4b[0]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v1.4b[0]
+ INS v9.d[1], x14
+ SDOT v18.4s, v8.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ SDOT v19.4s, v8.16b, v3.4b[0]
+ LDR d4, [x3], 8
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v0.4b[0]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v1.4b[0]
+ INS v10.d[1], x14
+ SDOT v22.4s, v9.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ SDOT v23.4s, v9.16b, v3.4b[0]
+ LDR d5, [x15], 8
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v0.4b[0]
+ LDR d8, [x5], 8
+ SDOT v25.4s, v10.16b, v1.4b[0]
+ INS v11.d[1], x14
+ SDOT v26.4s, v10.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ SDOT v27.4s, v10.16b, v3.4b[0]
+ LDR d6, [x13], 8
+
+ # BLOCK 3
+ SDOT v28.4s, v11.16b, v0.4b[0]
+ LDR d9, [x5], 8
+ SDOT v29.4s, v11.16b, v1.4b[0]
+ INS v8.d[1], x14
+ SDOT v30.4s, v11.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ SDOT v31.4s, v11.16b, v3.4b[0]
+ LDR d7, [x4], 8
+
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v0.4b[1]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v1.4b[1]
+ INS v9.d[1], x14
+ SDOT v18.4s, v8.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ SDOT v19.4s, v8.16b, v3.4b[1]
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v0.4b[1]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v1.4b[1]
+ INS v10.d[1], x14
+ SDOT v22.4s, v9.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ SDOT v23.4s, v9.16b, v3.4b[1]
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v0.4b[1]
+ LDR d8, [x5], 8
+ SDOT v25.4s, v10.16b, v1.4b[1]
+ INS v11.d[1], x14
+ SDOT v26.4s, v10.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ SDOT v27.4s, v10.16b, v3.4b[1]
+
+        # BLOCK 3
+ SDOT v28.4s, v11.16b, v0.4b[1]
+ LDR d9, [x5], 8
+ SDOT v29.4s, v11.16b, v1.4b[1]
+ INS v8.d[1], x14
+ SDOT v30.4s, v11.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ SDOT v31.4s, v11.16b, v3.4b[1]
+
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v4.4b[0]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v5.4b[0]
+ INS v9.d[1], x14
+ SDOT v18.4s, v8.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ SDOT v19.4s, v8.16b, v7.4b[0]
+ LDR d0, [x3], 8
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v4.4b[0]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v5.4b[0]
+ INS v10.d[1], x14
+ SDOT v22.4s, v9.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ SDOT v23.4s, v9.16b, v7.4b[0]
+ LDR d1, [x15], 8
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v4.4b[0]
+ LDR d8, [x5], 8
+ SDOT v25.4s, v10.16b, v5.4b[0]
+ INS v11.d[1], x14
+ SDOT v26.4s, v10.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ SDOT v27.4s, v10.16b, v7.4b[0]
+ LDR d2, [x13], 8
+
+ # BLOCK 3
+ SDOT v28.4s, v11.16b, v4.4b[0]
+ LDR d9, [x5], 8
+ SDOT v29.4s, v11.16b, v5.4b[0]
+ INS v8.d[1], x14
+ SDOT v30.4s, v11.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ SDOT v31.4s, v11.16b, v7.4b[0]
+ LDR d3, [x4], 8
+
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v4.4b[1]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v5.4b[1]
+ INS v9.d[1], x14
+ SDOT v18.4s, v8.16b, v6.4b[1]
+ LDR x14, [x5], 8
+ SDOT v19.4s, v8.16b, v7.4b[1]
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v4.4b[1]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v5.4b[1]
+ INS v10.d[1], x14
+ SDOT v22.4s, v9.16b, v6.4b[1]
+ LDR x14, [x5], 8
+ SDOT v23.4s, v9.16b, v7.4b[1]
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v4.4b[1]
+ LDR d8, [x5], 8 // First B values for block 0 and 1
+ SDOT v25.4s, v10.16b, v5.4b[1]
+ INS v11.d[1], x14
+ SDOT v26.4s, v10.16b, v6.4b[1]
+ LDR x14, [x5], 8
+ SDOT v27.4s, v10.16b, v7.4b[1]
+ SUBS x0, x0, 16
+
+ # BLOCK 3
+ SDOT v28.4s, v11.16b, v4.4b[1]
+ LDR d9, [x5], 8
+ SDOT v29.4s, v11.16b, v5.4b[1]
+ INS v8.d[1], x14
+ SDOT v30.4s, v11.16b, v6.4b[1]
+ LDR x14, [x5], 8
+ SDOT v31.4s, v11.16b, v7.4b[1]
+ B.HS 1b
+
+ # Epilogue. Same as main loop but no preloads in final group
+2:
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v0.4b[0]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v1.4b[0]
+ INS v9.d[1], x14
+ SDOT v18.4s, v8.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ SDOT v19.4s, v8.16b, v3.4b[0]
+ LDR d4, [x3], 8
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v0.4b[0]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v1.4b[0]
+ INS v10.d[1], x14
+ SDOT v22.4s, v9.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ SDOT v23.4s, v9.16b, v3.4b[0]
+ LDR d5, [x15], 8
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v0.4b[0]
+ LDR d8, [x5], 8
+ SDOT v25.4s, v10.16b, v1.4b[0]
+ INS v11.d[1], x14
+ SDOT v26.4s, v10.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ SDOT v27.4s, v10.16b, v3.4b[0]
+ LDR d6, [x13], 8
+
+ # BLOCK 3
+ SDOT v28.4s, v11.16b, v0.4b[0]
+ LDR d9, [x5], 8
+ SDOT v29.4s, v11.16b, v1.4b[0]
+ INS v8.d[1], x14
+ SDOT v30.4s, v11.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ SDOT v31.4s, v11.16b, v3.4b[0]
+ LDR d7, [x4], 8
+
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v0.4b[1]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v1.4b[1]
+ INS v9.d[1], x14
+ SDOT v18.4s, v8.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ SDOT v19.4s, v8.16b, v3.4b[1]
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v0.4b[1]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v1.4b[1]
+ INS v10.d[1], x14
+ SDOT v22.4s, v9.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ SDOT v23.4s, v9.16b, v3.4b[1]
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v0.4b[1]
+ LDR d8, [x5], 8
+ SDOT v25.4s, v10.16b, v1.4b[1]
+ INS v11.d[1], x14
+ SDOT v26.4s, v10.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ SDOT v27.4s, v10.16b, v3.4b[1]
+
+        # BLOCK 3
+ SDOT v28.4s, v11.16b, v0.4b[1]
+ LDR d9, [x5], 8
+ SDOT v29.4s, v11.16b, v1.4b[1]
+ INS v8.d[1], x14
+ SDOT v30.4s, v11.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ SDOT v31.4s, v11.16b, v3.4b[1]
+
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v4.4b[0]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v5.4b[0]
+ INS v9.d[1], x14
+ SDOT v18.4s, v8.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ SDOT v19.4s, v8.16b, v7.4b[0]
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v4.4b[0]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v5.4b[0]
+ INS v10.d[1], x14
+ SDOT v22.4s, v9.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ SDOT v23.4s, v9.16b, v7.4b[0]
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v4.4b[0]
+ LDR d8, [x5], 8
+ SDOT v25.4s, v10.16b, v5.4b[0]
+ INS v11.d[1], x14
+ SDOT v26.4s, v10.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ SDOT v27.4s, v10.16b, v7.4b[0]
+
+ # BLOCK 3
+ SDOT v28.4s, v11.16b, v4.4b[0]
+ LDR d9, [x5], 8
+ SDOT v29.4s, v11.16b, v5.4b[0]
+ INS v8.d[1], x14
+ SDOT v30.4s, v11.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ SDOT v31.4s, v11.16b, v7.4b[0]
+
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v4.4b[1]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v5.4b[1]
+ INS v9.d[1], x14
+ SDOT v18.4s, v8.16b, v6.4b[1]
+ LDR x14, [x5], 8
+ SDOT v19.4s, v8.16b, v7.4b[1]
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v4.4b[1]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v5.4b[1]
+ INS v10.d[1], x14
+ SDOT v22.4s, v9.16b, v6.4b[1]
+ LDR x14, [x5], 8
+ SDOT v23.4s, v9.16b, v7.4b[1]
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v4.4b[1]
+ SDOT v25.4s, v10.16b, v5.4b[1]
+ INS v11.d[1], x14
+ SDOT v26.4s, v10.16b, v6.4b[1]
+ SDOT v27.4s, v10.16b, v7.4b[1]
+ AND x0, x2, 15 // kc remainder 0 to 12
+
+ # BLOCK 3
+ SDOT v28.4s, v11.16b, v4.4b[1]
+ SDOT v29.4s, v11.16b, v5.4b[1]
+ SDOT v30.4s, v11.16b, v6.4b[1]
+ SDOT v31.4s, v11.16b, v7.4b[1]
+
+ # Is there a remainder?- 4 to 12 bytes of A
+ CBNZ x0, 5f
+
+ .p2align 3
+3:
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x11], 4
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v0.4s, v16.4s
+ FMUL v17.4s, v0.4s, v17.4s
+ FMUL v18.4s, v0.4s, v18.4s
+ FMUL v19.4s, v0.4s, v19.4s
+ FMUL v20.4s, v0.4s, v20.4s
+ FMUL v21.4s, v0.4s, v21.4s
+ FMUL v22.4s, v0.4s, v22.4s
+ FMUL v23.4s, v0.4s, v23.4s
+ FMUL v24.4s, v0.4s, v24.4s
+ FMUL v25.4s, v0.4s, v25.4s
+ FMUL v26.4s, v0.4s, v26.4s
+ FMUL v27.4s, v0.4s, v27.4s
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v2.8h}, [x11], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v2.8h
+ SQADD v17.8h, v17.8h, v2.8h
+ SQADD v18.8h, v18.8h, v2.8h
+ SQADD v19.8h, v19.8h, v2.8h
+ SQADD v24.8h, v24.8h, v2.8h
+ SQADD v25.8h, v25.8h, v2.8h
+ SQADD v26.8h, v26.8h, v2.8h
+ SQADD v27.8h, v27.8h, v2.8h
+ LD1R {v0.16b}, [x11], 1 // clamp min value
+
+ SQXTN v4.8b, v16.8h
+ SQXTN v5.8b, v17.8h
+ SQXTN v6.8b, v18.8h
+ SQXTN v7.8b, v19.8h
+ LD1R {v1.16b}, [x11] // clamp max value
+ SQXTN2 v4.16b, v24.8h
+ SQXTN2 v5.16b, v25.8h
+ SQXTN2 v6.16b, v26.8h
+ SQXTN2 v7.16b, v27.8h
+ LDR x12, [sp, 32] // cn_stride
+
+ SMAX v4.16b, v4.16b, v0.16b
+ SMAX v5.16b, v5.16b, v0.16b
+ SMAX v6.16b, v6.16b, v0.16b
+ SMAX v7.16b, v7.16b, v0.16b
+ SUBS x1, x1, 16
+ SMIN v4.16b, v4.16b, v1.16b
+ SMIN v5.16b, v5.16b, v1.16b
+ SMIN v6.16b, v6.16b, v1.16b
+ SMIN v7.16b, v7.16b, v1.16b
+ B.LO 6f
+
+ # Store full 4 x 16
+ ST1 {v4.16b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v5.16b}, [x8], x12
+ SUB x15, x15, x2 // a1 -= kc
+ ST1 {v6.16b}, [x9], x12
+ SUB x13, x13, x2 // a2 -= kc
+ ST1 {v7.16b}, [x7], x12
+ SUB x4, x4, x2 // a3 -= kc
+ B.NE 0b
+
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 32
+ RET
+
+ # Remainder- 4 to 12 bytes of A
+        # Although C4, it's safe to read 16 bytes.
+ .p2align 3
+4:
+ AND x0, x2, 15 // kc remainder 4 to 12
+5:
+ LDP q8, q9, [x5], 32
+ LDP q10, q11, [x5], 32
+ LD1 {v0.16b}, [x3], x0
+ LD1 {v1.16b}, [x15], x0
+ LD1 {v2.16b}, [x13], x0
+ LD1 {v3.16b}, [x4], x0
+ SDOT v16.4s, v8.16b, v0.4b[0]
+ SDOT v17.4s, v8.16b, v1.4b[0]
+ SDOT v18.4s, v8.16b, v2.4b[0]
+ SDOT v19.4s, v8.16b, v3.4b[0]
+ SDOT v20.4s, v9.16b, v0.4b[0]
+ SDOT v21.4s, v9.16b, v1.4b[0]
+ SDOT v22.4s, v9.16b, v2.4b[0]
+ SDOT v23.4s, v9.16b, v3.4b[0]
+ SDOT v24.4s, v10.16b, v0.4b[0]
+ SDOT v25.4s, v10.16b, v1.4b[0]
+ SDOT v26.4s, v10.16b, v2.4b[0]
+ SDOT v27.4s, v10.16b, v3.4b[0]
+ SDOT v28.4s, v11.16b, v0.4b[0]
+ SDOT v29.4s, v11.16b, v1.4b[0]
+ SDOT v30.4s, v11.16b, v2.4b[0]
+ SDOT v31.4s, v11.16b, v3.4b[0]
+ CMP x0, 4
+ B.LS 3b
+ LDP q8, q9, [x5], 32
+ LDP q10, q11, [x5], 32
+ SDOT v16.4s, v8.16b, v0.4b[1]
+ SDOT v17.4s, v8.16b, v1.4b[1]
+ SDOT v18.4s, v8.16b, v2.4b[1]
+ SDOT v19.4s, v8.16b, v3.4b[1]
+ SDOT v20.4s, v9.16b, v0.4b[1]
+ SDOT v21.4s, v9.16b, v1.4b[1]
+ SDOT v22.4s, v9.16b, v2.4b[1]
+ SDOT v23.4s, v9.16b, v3.4b[1]
+ SDOT v24.4s, v10.16b, v0.4b[1]
+ SDOT v25.4s, v10.16b, v1.4b[1]
+ SDOT v26.4s, v10.16b, v2.4b[1]
+ SDOT v27.4s, v10.16b, v3.4b[1]
+ SDOT v28.4s, v11.16b, v0.4b[1]
+ SDOT v29.4s, v11.16b, v1.4b[1]
+ SDOT v30.4s, v11.16b, v2.4b[1]
+ SDOT v31.4s, v11.16b, v3.4b[1]
+ CMP x0, 8
+ B.LS 3b
+ LDP q8, q9, [x5], 32
+ LDP q10, q11, [x5], 32
+ SDOT v16.4s, v8.16b, v0.4b[2]
+ SDOT v17.4s, v8.16b, v1.4b[2]
+ SDOT v18.4s, v8.16b, v2.4b[2]
+ SDOT v19.4s, v8.16b, v3.4b[2]
+ SDOT v20.4s, v9.16b, v0.4b[2]
+ SDOT v21.4s, v9.16b, v1.4b[2]
+ SDOT v22.4s, v9.16b, v2.4b[2]
+ SDOT v23.4s, v9.16b, v3.4b[2]
+ SDOT v24.4s, v10.16b, v0.4b[2]
+ SDOT v25.4s, v10.16b, v1.4b[2]
+ SDOT v26.4s, v10.16b, v2.4b[2]
+ SDOT v27.4s, v10.16b, v3.4b[2]
+ SDOT v28.4s, v11.16b, v0.4b[2]
+ SDOT v29.4s, v11.16b, v1.4b[2]
+ SDOT v30.4s, v11.16b, v2.4b[2]
+ SDOT v31.4s, v11.16b, v3.4b[2]
+ B 3b
+
+ # Store odd width
+ .p2align 3
+6:
+ TBZ x1, 3, 7f
+ STR d4, [x6], 8
+ DUP d4, v4.d[1]
+ STR d5, [x8], 8
+ DUP d5, v5.d[1]
+ STR d6, [x9], 8
+ DUP d6, v6.d[1]
+ STR d7, [x7], 8
+ DUP d7, v7.d[1]
+7:
+ TBZ x1, 2, 8f
+ STR s4, [x6], 4
+ DUP s4, v4.s[1]
+ STR s5, [x8], 4
+ DUP s5, v5.s[1]
+ STR s6, [x9], 4
+ DUP s6, v6.s[1]
+ STR s7, [x7], 4
+ DUP s7, v7.s[1]
+8:
+ TBZ x1, 1, 9f
+ ST1 {v4.h}[0], [x6], 2
+ DUP h4, v4.h[1]
+ ST1 {v5.h}[0], [x8], 2
+ DUP h5, v5.h[1]
+ ST1 {v6.h}[0], [x9], 2
+ DUP h6, v6.h[1]
+ ST1 {v7.h}[0], [x7], 2
+ DUP h7, v7.h[1]
+9:
+ TBZ x1, 0, 10f
+ ST1 {v4.b}[0], [x6]
+ ST1 {v5.b}[0], [x8]
+ ST1 {v6.b}[0], [x9]
+ ST1 {v7.b}[0], [x7]
+10:
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 32
+ RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S b/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S
new file mode 100644
index 0000000..e48aaef
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld32.S
@@ -0,0 +1,274 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld32.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const int8_t* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qs8_minmax_params params) [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3 x4 v3
+# B x5 v4 v5 v6 v7
+# C0 x6 v16 v20 v24 v28
+# C1 x8 v17 v21 v25 v29
+# C2 x9 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v8 v9 v10 v11 v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32
+
+ # Clamp A and C pointers
+ CMP x0, 2 // if mr < 2
+ ADD x2, x2, 3 // kc = (kc + 3) & ~3
+ ADD x15, x3, x4 // a1 = a0 + a_stride
+ ADD x8, x6, x7 // c1 = c0 + cm_stride
+ CSEL x15, x3, x15, LO // a1 = a0
+ CSEL x8, x6, x8, LO // c1 = c0
+ BIC x2, x2, 3
+
+ ADD x13, x15, x4 // a2 = a1 + a_stride
+ ADD x9, x8, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x13, x15, x13, LS // a2 = a1
+ CSEL x9, x8, x9, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ ADD x4, x13, x4 // a3 = a2 + a_stride
+ ADD x7, x9, x7 // c3 = c2 + cm_stride
+ CSEL x4, x13, x4, LO // a3 = a2
+ CSEL x7, x9, x7, LO // c3 = c2
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ LDR x11, [sp, 8] // reload params
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ MOV x0, x2 // k = kc. assumes kc > 0
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+
+ # Main loop - 4 bytes of A
+ .p2align 3
+1:
+ LDR s0, [x3], 4
+ LDR q4, [x5], 16
+ LDR s1, [x15], 4
+ LDR s2, [x13], 4
+ LDR s3, [x4], 4
+ SDOT v16.4s, v4.16b, v0.4b[0]
+ SDOT v17.4s, v4.16b, v1.4b[0]
+ LDR q5, [x5], 16
+ SDOT v18.4s, v4.16b, v2.4b[0]
+ SDOT v19.4s, v4.16b, v3.4b[0]
+ LDR q6, [x5], 16
+ SDOT v20.4s, v5.16b, v0.4b[0]
+ SDOT v21.4s, v5.16b, v1.4b[0]
+ LDR q7, [x5], 16
+ SDOT v22.4s, v5.16b, v2.4b[0]
+ SDOT v23.4s, v5.16b, v3.4b[0]
+ SUBS x0, x0, 4
+ SDOT v24.4s, v6.16b, v0.4b[0]
+ SDOT v25.4s, v6.16b, v1.4b[0]
+ SDOT v26.4s, v6.16b, v2.4b[0]
+ SDOT v27.4s, v6.16b, v3.4b[0]
+ SDOT v28.4s, v7.16b, v0.4b[0]
+ SDOT v29.4s, v7.16b, v1.4b[0]
+ SDOT v30.4s, v7.16b, v2.4b[0]
+ SDOT v31.4s, v7.16b, v3.4b[0]
+ B.HI 1b
+
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x11], 4
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v0.4s, v16.4s
+ FMUL v17.4s, v0.4s, v17.4s
+ FMUL v18.4s, v0.4s, v18.4s
+ FMUL v19.4s, v0.4s, v19.4s
+ FMUL v20.4s, v0.4s, v20.4s
+ FMUL v21.4s, v0.4s, v21.4s
+ FMUL v22.4s, v0.4s, v22.4s
+ FMUL v23.4s, v0.4s, v23.4s
+ FMUL v24.4s, v0.4s, v24.4s
+ FMUL v25.4s, v0.4s, v25.4s
+ FMUL v26.4s, v0.4s, v26.4s
+ FMUL v27.4s, v0.4s, v27.4s
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v2.8h}, [x11], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v2.8h
+ SQADD v17.8h, v17.8h, v2.8h
+ SQADD v18.8h, v18.8h, v2.8h
+ SQADD v19.8h, v19.8h, v2.8h
+ SQADD v24.8h, v24.8h, v2.8h
+ SQADD v25.8h, v25.8h, v2.8h
+ SQADD v26.8h, v26.8h, v2.8h
+ SQADD v27.8h, v27.8h, v2.8h
+ LD1R {v0.16b}, [x11], 1 // clamp min value
+
+ SQXTN v4.8b, v16.8h
+ SQXTN v5.8b, v17.8h
+ SQXTN v6.8b, v18.8h
+ SQXTN v7.8b, v19.8h
+ LD1R {v1.16b}, [x11] // clamp max value
+ SQXTN2 v4.16b, v24.8h
+ SQXTN2 v5.16b, v25.8h
+ SQXTN2 v6.16b, v26.8h
+ SQXTN2 v7.16b, v27.8h
+ LDR x12, [sp] // cn_stride
+
+ SMAX v4.16b, v4.16b, v0.16b
+ SMAX v5.16b, v5.16b, v0.16b
+ SMAX v6.16b, v6.16b, v0.16b
+ SMAX v7.16b, v7.16b, v0.16b
+ SUBS x1, x1, 16
+ SMIN v4.16b, v4.16b, v1.16b
+ SMIN v5.16b, v5.16b, v1.16b
+ SMIN v6.16b, v6.16b, v1.16b
+ SMIN v7.16b, v7.16b, v1.16b
+ B.LO 2f
+
+ # Store full 4 x 16
+ ST1 {v4.16b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v5.16b}, [x8], x12
+ SUB x15, x15, x2 // a1 -= kc
+ ST1 {v6.16b}, [x9], x12
+ SUB x13, x13, x2 // a2 -= kc
+ ST1 {v7.16b}, [x7], x12
+ SUB x4, x4, x2 // a3 -= kc
+ B.NE 0b
+ RET
+
+ # Store odd width
+ .p2align 3
+2:
+ TBZ x1, 3, 3f
+ STR d4, [x6], 8
+ DUP d4, v4.d[1]
+ STR d5, [x8], 8
+ DUP d5, v5.d[1]
+ STR d6, [x9], 8
+ DUP d6, v6.d[1]
+ STR d7, [x7], 8
+ DUP d7, v7.d[1]
+3:
+ TBZ x1, 2, 4f
+ STR s4, [x6], 4
+ DUP s4, v4.s[1]
+ STR s5, [x8], 4
+ DUP s5, v5.s[1]
+ STR s6, [x9], 4
+ DUP s6, v6.s[1]
+ STR s7, [x7], 4
+ DUP s7, v7.s[1]
+4:
+ TBZ x1, 1, 5f
+ ST1 {v4.h}[0], [x6], 2
+ DUP h4, v4.h[1]
+ ST1 {v5.h}[0], [x8], 2
+ DUP h5, v5.h[1]
+ ST1 {v6.h}[0], [x9], 2
+ DUP h6, v6.h[1]
+ ST1 {v7.h}[0], [x7], 2
+ DUP h7, v7.h[1]
+5:
+ TBZ x1, 0, 6f
+ ST1 {v4.b}[0], [x6]
+ ST1 {v5.b}[0], [x8]
+ ST1 {v6.b}[0], [x9]
+ ST1 {v7.b}[0], [x7]
+6:
+ RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S b/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
new file mode 100644
index 0000000..4a4303d
--- /dev/null
+++ b/src/qs8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
@@ -0,0 +1,325 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/4x16c4-aarch64-neondot-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const int8_t* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qs8_minmax_params params) [sp + 8] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3 x4 v3
+# B x5 v4 v5 v6 v7
+# C0 x6 v16 v20 v24 v28
+# C1 x8 v17 v21 v25 v29
+# C2 x9 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v8 v9 v10 v11 v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
+
+ # Clamp A and C pointers
+ CMP x0, 2 // if mr < 2
+ ADD x2, x2, 3 // kc = (kc + 3) & ~3
+ ADD x15, x3, x4 // a1 = a0 + a_stride
+ ADD x8, x6, x7 // c1 = c0 + cm_stride
+ CSEL x15, x3, x15, LO // a1 = a0
+ CSEL x8, x6, x8, LO // c1 = c0
+ BIC x2, x2, 3
+
+ ADD x13, x15, x4 // a2 = a1 + a_stride
+ ADD x9, x8, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x13, x15, x13, LS // a2 = a1
+ CSEL x9, x8, x9, LS // c2 = c1
+
+ CMP x0, 4 // if mr < 4
+ ADD x4, x13, x4 // a3 = a2 + a_stride
+ ADD x7, x9, x7 // c3 = c2 + cm_stride
+ CSEL x4, x13, x4, LO // a3 = a2
+ CSEL x7, x9, x7, LO // c3 = c2
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ LDR x11, [sp, 8] // params
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ SUBS x0, x2, 8 // k = kc - 8
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+  # Are there at least 8 bytes of A?
+ B.LO 3f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+1:
+ LDR d0, [x3], 8
+ LDR q4, [x5], 16
+ LDR d1, [x15], 8
+ LDR d2, [x13], 8
+ LDR d3, [x4], 8
+ LDR q5, [x5], 16
+ SDOT v16.4s, v4.16b, v0.4b[0]
+ SDOT v17.4s, v4.16b, v1.4b[0]
+ LDP q6, q7, [x5], 32
+ SDOT v18.4s, v4.16b, v2.4b[0]
+ SDOT v19.4s, v4.16b, v3.4b[0]
+ SDOT v20.4s, v5.16b, v0.4b[0]
+ SDOT v21.4s, v5.16b, v1.4b[0]
+ SDOT v22.4s, v5.16b, v2.4b[0]
+ SDOT v23.4s, v5.16b, v3.4b[0]
+ SDOT v24.4s, v6.16b, v0.4b[0]
+ SDOT v25.4s, v6.16b, v1.4b[0]
+ LDP q4, q5, [x5], 32
+ SDOT v26.4s, v6.16b, v2.4b[0]
+ SDOT v27.4s, v6.16b, v3.4b[0]
+ SDOT v28.4s, v7.16b, v0.4b[0]
+ SDOT v29.4s, v7.16b, v1.4b[0]
+ SDOT v30.4s, v7.16b, v2.4b[0]
+ SDOT v31.4s, v7.16b, v3.4b[0]
+ SDOT v16.4s, v4.16b, v0.4b[1]
+ SDOT v17.4s, v4.16b, v1.4b[1]
+ LDP q6, q7, [x5], 32
+ SDOT v18.4s, v4.16b, v2.4b[1]
+ SDOT v19.4s, v4.16b, v3.4b[1]
+ SDOT v20.4s, v5.16b, v0.4b[1]
+ SDOT v21.4s, v5.16b, v1.4b[1]
+ SDOT v22.4s, v5.16b, v2.4b[1]
+ SDOT v23.4s, v5.16b, v3.4b[1]
+ SDOT v24.4s, v6.16b, v0.4b[1]
+ SDOT v25.4s, v6.16b, v1.4b[1]
+ SDOT v26.4s, v6.16b, v2.4b[1]
+ SDOT v27.4s, v6.16b, v3.4b[1]
+ SDOT v28.4s, v7.16b, v0.4b[1]
+ SDOT v29.4s, v7.16b, v1.4b[1]
+ SDOT v30.4s, v7.16b, v2.4b[1]
+ SUBS x0, x0, 8
+ SDOT v31.4s, v7.16b, v3.4b[1]
+ B.HS 1b
+
+  # Is there a remainder? - 4 bytes of A
+ TBNZ x0, 2, 3f
+
+2:
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x11], 4
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v0.4s, v16.4s
+ FMUL v17.4s, v0.4s, v17.4s
+ FMUL v18.4s, v0.4s, v18.4s
+ FMUL v19.4s, v0.4s, v19.4s
+ FMUL v20.4s, v0.4s, v20.4s
+ FMUL v21.4s, v0.4s, v21.4s
+ FMUL v22.4s, v0.4s, v22.4s
+ FMUL v23.4s, v0.4s, v23.4s
+ FMUL v24.4s, v0.4s, v24.4s
+ FMUL v25.4s, v0.4s, v25.4s
+ FMUL v26.4s, v0.4s, v26.4s
+ FMUL v27.4s, v0.4s, v27.4s
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v2.8h}, [x11], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v2.8h
+ SQADD v17.8h, v17.8h, v2.8h
+ SQADD v18.8h, v18.8h, v2.8h
+ SQADD v19.8h, v19.8h, v2.8h
+ SQADD v24.8h, v24.8h, v2.8h
+ SQADD v25.8h, v25.8h, v2.8h
+ SQADD v26.8h, v26.8h, v2.8h
+ SQADD v27.8h, v27.8h, v2.8h
+ LD1R {v0.16b}, [x11], 1 // clamp min value
+
+ SQXTN v4.8b, v16.8h
+ SQXTN v5.8b, v17.8h
+ SQXTN v6.8b, v18.8h
+ SQXTN v7.8b, v19.8h
+ LD1R {v1.16b}, [x11] // clamp max value
+ SQXTN2 v4.16b, v24.8h
+ SQXTN2 v5.16b, v25.8h
+ SQXTN2 v6.16b, v26.8h
+ SQXTN2 v7.16b, v27.8h
+ LDR x12, [sp] // cn_stride
+
+ SMAX v4.16b, v4.16b, v0.16b
+ SMAX v5.16b, v5.16b, v0.16b
+ SMAX v6.16b, v6.16b, v0.16b
+ SMAX v7.16b, v7.16b, v0.16b
+ SUBS x1, x1, 16
+ SMIN v4.16b, v4.16b, v1.16b
+ SMIN v5.16b, v5.16b, v1.16b
+ SMIN v6.16b, v6.16b, v1.16b
+ SMIN v7.16b, v7.16b, v1.16b
+ B.LO 4f
+
+ # Store full 4 x 16
+ ST1 {v4.16b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v5.16b}, [x8], x12
+ SUB x15, x15, x2 // a1 -= kc
+ ST1 {v6.16b}, [x9], x12
+ SUB x13, x13, x2 // a2 -= kc
+ ST1 {v7.16b}, [x7], x12
+ SUB x4, x4, x2 // a3 -= kc
+ B.NE 0b
+ RET
+
+  # Remainder - 4 bytes of A
+ .p2align 3
+3:
+ LDR s0, [x3], 4
+ LDR q4, [x5], 16
+ LDR s1, [x15], 4
+ LDR s2, [x13], 4
+ LDR s3, [x4], 4
+ SDOT v16.4s, v4.16b, v0.4b[0]
+ LDR q5, [x5], 16
+ SDOT v17.4s, v4.16b, v1.4b[0]
+ SDOT v18.4s, v4.16b, v2.4b[0]
+ SDOT v19.4s, v4.16b, v3.4b[0]
+ SDOT v20.4s, v5.16b, v0.4b[0]
+ LDP q6, q7, [x5], 32
+ SDOT v21.4s, v5.16b, v1.4b[0]
+ SDOT v22.4s, v5.16b, v2.4b[0]
+ SDOT v23.4s, v5.16b, v3.4b[0]
+ SDOT v24.4s, v6.16b, v0.4b[0]
+ SDOT v25.4s, v6.16b, v1.4b[0]
+ SDOT v26.4s, v6.16b, v2.4b[0]
+ SDOT v27.4s, v6.16b, v3.4b[0]
+ SDOT v28.4s, v7.16b, v0.4b[0]
+ SDOT v29.4s, v7.16b, v1.4b[0]
+ SDOT v30.4s, v7.16b, v2.4b[0]
+ SDOT v31.4s, v7.16b, v3.4b[0]
+ B 2b
+
+ # Store odd width
+ .p2align 3
+4:
+ TBZ x1, 3, 5f
+ STR d4, [x6], 8
+ DUP d4, v4.d[1]
+ STR d5, [x8], 8
+ DUP d5, v5.d[1]
+ STR d6, [x9], 8
+ DUP d6, v6.d[1]
+ STR d7, [x7], 8
+ DUP d7, v7.d[1]
+5:
+ TBZ x1, 2, 6f
+ STR s4, [x6], 4
+ DUP s4, v4.s[1]
+ STR s5, [x8], 4
+ DUP s5, v5.s[1]
+ STR s6, [x9], 4
+ DUP s6, v6.s[1]
+ STR s7, [x7], 4
+ DUP s7, v7.s[1]
+6:
+ TBZ x1, 1, 7f
+ ST1 {v4.h}[0], [x6], 2
+ DUP h4, v4.h[1]
+ ST1 {v5.h}[0], [x8], 2
+ DUP h5, v5.h[1]
+ ST1 {v6.h}[0], [x9], 2
+ DUP h6, v6.h[1]
+ ST1 {v7.h}[0], [x7], 2
+ DUP h7, v7.h[1]
+7:
+ TBZ x1, 0, 8f
+ ST1 {v4.b}[0], [x6]
+ ST1 {v5.b}[0], [x8]
+ ST1 {v6.b}[0], [x9]
+ ST1 {v7.b}[0], [x7]
+8:
+ RET
+
+END_FUNCTION xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
index 09659d3..abcb262 100644
--- a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
+++ b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
@@ -9,6 +9,7 @@
#include <xnnpack/assembly.h>
+
# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55(
# size_t mr, x0
# size_t nc, x1
@@ -19,7 +20,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x12
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+# const union xnn_qs8_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -69,7 +70,7 @@
LDP q24, q28, [x5], 32
MOV v19.16b, v16.16b
MOV v21.16b, v20.16b
- LDR x11, [sp, 40] // params
+ LDR x11, [sp, 40] // reload params
MOV v22.16b, v20.16b
MOV v23.16b, v20.16b
MOV v25.16b, v24.16b
@@ -408,73 +409,70 @@
.p2align 3
3:
# Apply params - scale, shift, bias and clamp
- LD1R {v0.4s}, [x11], 4
- SQRDMULH v4.4s, v16.4s, v0.4s
- SQRDMULH v5.4s, v17.4s, v0.4s
- LD1R {v1.4s}, [x11], 4
- SQRDMULH v6.4s, v18.4s, v0.4s
- SQRDMULH v7.4s, v19.4s, v0.4s
- SQRDMULH v8.4s, v20.4s, v0.4s
- SQRDMULH v9.4s, v21.4s, v0.4s
+ LD2R {v0.4s, v1.4s}, [x11], 8
CMEQ v2.4s, v1.4s, 0
- SQRDMULH v10.4s, v22.4s, v0.4s
- SQRDMULH v11.4s, v23.4s, v0.4s
- BIC v16.16b, v16.16b, v2.16b
- BIC v17.16b, v17.16b, v2.16b
- BIC v18.16b, v18.16b, v2.16b
- BIC v19.16b, v19.16b, v2.16b
- BIC v20.16b, v20.16b, v2.16b
- BIC v21.16b, v21.16b, v2.16b
- BIC v22.16b, v22.16b, v2.16b
- BIC v23.16b, v23.16b, v2.16b
+ BIC v4.16b, v16.16b, v2.16b
+ BIC v5.16b, v17.16b, v2.16b
+ BIC v6.16b, v18.16b, v2.16b
+ BIC v7.16b, v19.16b, v2.16b
- SSRA v4.4s, v16.4s, 31 // signed shift right accumulate
- SSRA v5.4s, v17.4s, 31
- SSRA v6.4s, v18.4s, 31
- SSRA v7.4s, v19.4s, 31
- SSRA v8.4s, v20.4s, 31
- SSRA v9.4s, v21.4s, 31
- SSRA v10.4s, v22.4s, 31
- SSRA v11.4s, v23.4s, 31
+ SQRDMULH v16.4s, v16.4s, v0.4s
+ SQRDMULH v17.4s, v17.4s, v0.4s
+ SQRDMULH v18.4s, v18.4s, v0.4s
+ SQRDMULH v19.4s, v19.4s, v0.4s
- SQRDMULH v16.4s, v24.4s, v0.4s
- SQRDMULH v17.4s, v25.4s, v0.4s
- SQRDMULH v18.4s, v26.4s, v0.4s
- SQRDMULH v19.4s, v27.4s, v0.4s
- SQRDMULH v20.4s, v28.4s, v0.4s
- SQRDMULH v21.4s, v29.4s, v0.4s
- SQRDMULH v22.4s, v30.4s, v0.4s
- SQRDMULH v23.4s, v31.4s, v0.4s
+ SSRA v16.4s, v4.4s, 31 // signed shift right accumulate
+ SSRA v17.4s, v5.4s, 31
+ SSRA v18.4s, v6.4s, 31
+ SSRA v19.4s, v7.4s, 31
- BIC v24.16b, v24.16b, v2.16b
- BIC v25.16b, v25.16b, v2.16b
- BIC v26.16b, v26.16b, v2.16b
- BIC v27.16b, v27.16b, v2.16b
- BIC v28.16b, v28.16b, v2.16b
- BIC v29.16b, v29.16b, v2.16b
- BIC v30.16b, v30.16b, v2.16b
- BIC v31.16b, v31.16b, v2.16b
+ BIC v4.16b, v20.16b, v2.16b
+ BIC v5.16b, v21.16b, v2.16b
+ BIC v6.16b, v22.16b, v2.16b
+ BIC v7.16b, v23.16b, v2.16b
- SSRA v16.4s, v24.4s, 31
- SSRA v17.4s, v25.4s, 31
- SSRA v18.4s, v26.4s, 31
- SSRA v19.4s, v27.4s, 31
- SSRA v20.4s, v28.4s, 31
- SSRA v21.4s, v29.4s, 31
- SSRA v22.4s, v30.4s, 31
- SSRA v23.4s, v31.4s, 31
+ SQRDMULH v20.4s, v20.4s, v0.4s
+ SQRDMULH v21.4s, v21.4s, v0.4s
+ SQRDMULH v22.4s, v22.4s, v0.4s
+ SQRDMULH v23.4s, v23.4s, v0.4s
- SRSHL v4.4s, v4.4s, v1.4s // signed rounding shift left
- SRSHL v5.4s, v5.4s, v1.4s
- SRSHL v6.4s, v6.4s, v1.4s
- SRSHL v7.4s, v7.4s, v1.4s
- SRSHL v8.4s, v8.4s, v1.4s
- SRSHL v9.4s, v9.4s, v1.4s
- SRSHL v10.4s, v10.4s, v1.4s
- SRSHL v11.4s, v11.4s, v1.4s
+ SSRA v20.4s, v4.4s, 31
+ SSRA v21.4s, v5.4s, 31
+ SSRA v22.4s, v6.4s, 31
+ SSRA v23.4s, v7.4s, 31
- SRSHL v16.4s, v16.4s, v1.4s
+ BIC v4.16b, v24.16b, v2.16b
+ BIC v5.16b, v25.16b, v2.16b
+ BIC v6.16b, v26.16b, v2.16b
+ BIC v7.16b, v27.16b, v2.16b
+
+ SQRDMULH v24.4s, v24.4s, v0.4s
+ SQRDMULH v25.4s, v25.4s, v0.4s
+ SQRDMULH v26.4s, v26.4s, v0.4s
+ SQRDMULH v27.4s, v27.4s, v0.4s
+
+ SSRA v24.4s, v4.4s, 31
+ SSRA v25.4s, v5.4s, 31
+ SSRA v26.4s, v6.4s, 31
+ SSRA v27.4s, v7.4s, 31
+
+ BIC v4.16b, v28.16b, v2.16b
+ BIC v5.16b, v29.16b, v2.16b
+ BIC v6.16b, v30.16b, v2.16b
+ BIC v7.16b, v31.16b, v2.16b
+
+ SQRDMULH v28.4s, v28.4s, v0.4s
+ SQRDMULH v29.4s, v29.4s, v0.4s
+ SQRDMULH v30.4s, v30.4s, v0.4s
+ SQRDMULH v31.4s, v31.4s, v0.4s
+
+ SSRA v28.4s, v4.4s, 31
+ SSRA v29.4s, v5.4s, 31
+ SSRA v30.4s, v6.4s, 31
+ SSRA v31.4s, v7.4s, 31
+
+ SRSHL v16.4s, v16.4s, v1.4s // signed rounding shift left
SRSHL v17.4s, v17.4s, v1.4s
SRSHL v18.4s, v18.4s, v1.4s
SRSHL v19.4s, v19.4s, v1.4s
@@ -482,56 +480,64 @@
SRSHL v21.4s, v21.4s, v1.4s
SRSHL v22.4s, v22.4s, v1.4s
SRSHL v23.4s, v23.4s, v1.4s
+ SRSHL v24.4s, v24.4s, v1.4s
+ SRSHL v25.4s, v25.4s, v1.4s
+ SRSHL v26.4s, v26.4s, v1.4s
+ SRSHL v27.4s, v27.4s, v1.4s
+ SRSHL v28.4s, v28.4s, v1.4s
+ SRSHL v29.4s, v29.4s, v1.4s
+ SRSHL v30.4s, v30.4s, v1.4s
+ SRSHL v31.4s, v31.4s, v1.4s
- SQXTN v4.4h, v4.4s
- SQXTN v5.4h, v5.4s
- SQXTN v6.4h, v6.4s
- SQXTN v7.4h, v7.4s
SQXTN v16.4h, v16.4s
SQXTN v17.4h, v17.4s
SQXTN v18.4h, v18.4s
SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
LD1R {v2.8h}, [x11], 2 // add bias
- SQXTN2 v4.8h, v8.4s
- SQXTN2 v5.8h, v9.4s
- SQXTN2 v6.8h, v10.4s
- SQXTN2 v7.8h, v11.4s
SQXTN2 v16.8h, v20.4s
SQXTN2 v17.8h, v21.4s
SQXTN2 v18.8h, v22.4s
SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
- SQADD v4.8h, v4.8h, v2.8h
- SQADD v5.8h, v5.8h, v2.8h
- SQADD v6.8h, v6.8h, v2.8h
- SQADD v7.8h, v7.8h, v2.8h
SQADD v16.8h, v16.8h, v2.8h
SQADD v17.8h, v17.8h, v2.8h
SQADD v18.8h, v18.8h, v2.8h
SQADD v19.8h, v19.8h, v2.8h
+ SQADD v24.8h, v24.8h, v2.8h
+ SQADD v25.8h, v25.8h, v2.8h
+ SQADD v26.8h, v26.8h, v2.8h
+ SQADD v27.8h, v27.8h, v2.8h
LD1R {v0.16b}, [x11], 1 // clamp min value
- SQXTN v4.8b, v4.8h
- SQXTN v5.8b, v5.8h
- SQXTN v6.8b, v6.8h
- SQXTN v7.8b, v7.8h
+ SQXTN v4.8b, v16.8h
+ SQXTN v5.8b, v17.8h
+ SQXTN v6.8b, v18.8h
+ SQXTN v7.8b, v19.8h
LD1R {v1.16b}, [x11] // clamp max value
- SQXTN2 v4.16b, v16.8h
- SQXTN2 v5.16b, v17.8h
- SQXTN2 v6.16b, v18.8h
- SQXTN2 v7.16b, v19.8h
+ SQXTN2 v4.16b, v24.8h
+ SQXTN2 v5.16b, v25.8h
+ SQXTN2 v6.16b, v26.8h
+ SQXTN2 v7.16b, v27.8h
LDR x12, [sp, 32] // cn_stride
- SMAX v4.16b, v4.16b, v0.16b
- SMAX v5.16b, v5.16b, v0.16b
- SMAX v6.16b, v6.16b, v0.16b
- SMAX v7.16b, v7.16b, v0.16b
+ SMAX v4.16b, v4.16b, v0.16b
+ SMAX v5.16b, v5.16b, v0.16b
+ SMAX v6.16b, v6.16b, v0.16b
+ SMAX v7.16b, v7.16b, v0.16b
SUBS x1, x1, 16
- SMIN v4.16b, v4.16b, v1.16b
- SMIN v5.16b, v5.16b, v1.16b
- SMIN v6.16b, v6.16b, v1.16b
- SMIN v7.16b, v7.16b, v1.16b
+ SMIN v4.16b, v4.16b, v1.16b
+ SMIN v5.16b, v5.16b, v1.16b
+ SMIN v6.16b, v6.16b, v1.16b
+ SMIN v7.16b, v7.16b, v1.16b
B.LO 6f
# Store full 4 x 16
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
index e8a6d02..55ffe3f 100644
--- a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
+++ b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
@@ -22,7 +22,7 @@
# size_t cn_stride, [sp] -> x12
# const union xnn_qs8_minmax_params params) [sp + 8] -> x11
-# params structure is 11 bytes
+# params structure is 12 bytes
# struct {
# int32_t multiplier;
# int32_t right_shift;
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
index 75409e7..f4eac7d 100644
--- a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
+++ b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld32.S
@@ -9,6 +9,7 @@
#include <xnnpack/assembly.h>
+
# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32(
# size_t mr, x0
# size_t nc, x1
@@ -19,7 +20,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x12
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+# const union xnn_qs8_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -67,7 +68,7 @@
LDP q24, q28, [x5], 32
MOV v19.16b, v16.16b
MOV v21.16b, v20.16b
- LDR x11, [sp, 8] // params
+ LDR x11, [sp, 8] // reload params
MOV v22.16b, v20.16b
MOV v23.16b, v20.16b
MOV x0, x2 // k = kc. assumes kc > 0
diff --git a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
index d5d5df8..4d482ac 100644
--- a/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
+++ b/src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
@@ -9,6 +9,7 @@
#include <xnnpack/assembly.h>
+
# void xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64(
# size_t mr, x0
# size_t nc, x1
@@ -19,7 +20,7 @@
# int8_t* restrict c, x6
# size_t cm_stride, x7
# size_t cn_stride, [sp] -> x12
-# const union xnn_qs8_conv_minmax_params params) [sp + 8] -> x11
+# const union xnn_qs8_minmax_params params) [sp + 8] -> x11
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
diff --git a/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in b/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
index 972e930..c6b61ef 100644
--- a/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
+++ b/src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
@@ -5,7 +5,10 @@
#include <xnnpack/assembly.h>
-# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55(
# size_t mr, x0
# size_t nc, x1
# size_t kc, x2 / x0
@@ -17,7 +20,7 @@
# size_t cn_stride, [sp] -> (x0)
# size_t a_offset, [sp + 8] -> x11
# const float* zero, [sp + 16] -> x12
-# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x8)
+# const union ${CONV_PARAMS} params [sp + 24] -> (x8)
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -35,7 +38,7 @@
# x8 temp for Cortex-A55 loads
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55
# Clamp C pointers
CMP x0, 2 // if mr < 2
@@ -428,132 +431,194 @@
SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
B.HI 1b
- # Apply params - scale, shift, bias and clamp
- LD1R {v0.4s}, [x8], 4
- SQRDMULH v4.4s, v16.4s, v0.4s
- SQRDMULH v5.4s, v17.4s, v0.4s
- LD1R {v1.4s}, [x8], 4
- SQRDMULH v6.4s, v18.4s, v0.4s
- SQRDMULH v7.4s, v19.4s, v0.4s
- SQRDMULH v8.4s, v20.4s, v0.4s
- SQRDMULH v9.4s, v21.4s, v0.4s
- CMEQ v2.4s, v1.4s, 0
- SQRDMULH v10.4s, v22.4s, v0.4s
- SQRDMULH v11.4s, v23.4s, v0.4s
+ $if REQUANTIZATION == "GEMMLOWP":
+ # Apply params - scale, shift, bias and clamp
+ LD2R {v0.4s, v1.4s}, [x8], 8
+ CMEQ v2.4s, v1.4s, 0
- BIC v16.16b, v16.16b, v2.16b
- BIC v17.16b, v17.16b, v2.16b
- BIC v18.16b, v18.16b, v2.16b
- BIC v19.16b, v19.16b, v2.16b
- BIC v20.16b, v20.16b, v2.16b
- BIC v21.16b, v21.16b, v2.16b
- BIC v22.16b, v22.16b, v2.16b
- BIC v23.16b, v23.16b, v2.16b
+ BIC v4.16b, v16.16b, v2.16b
+ BIC v5.16b, v17.16b, v2.16b
+ BIC v6.16b, v18.16b, v2.16b
+ BIC v7.16b, v19.16b, v2.16b
- SSRA v4.4s, v16.4s, 31 // signed shift right accumulate
- SSRA v5.4s, v17.4s, 31
- SSRA v6.4s, v18.4s, 31
- SSRA v7.4s, v19.4s, 31
- SSRA v8.4s, v20.4s, 31
- SSRA v9.4s, v21.4s, 31
- SSRA v10.4s, v22.4s, 31
- SSRA v11.4s, v23.4s, 31
+ SQRDMULH v16.4s, v16.4s, v0.4s
+ SQRDMULH v17.4s, v17.4s, v0.4s
+ SQRDMULH v18.4s, v18.4s, v0.4s
+ SQRDMULH v19.4s, v19.4s, v0.4s
- SQRDMULH v16.4s, v24.4s, v0.4s
- SQRDMULH v17.4s, v25.4s, v0.4s
- SQRDMULH v18.4s, v26.4s, v0.4s
- SQRDMULH v19.4s, v27.4s, v0.4s
- SQRDMULH v20.4s, v28.4s, v0.4s
- SQRDMULH v21.4s, v29.4s, v0.4s
- SQRDMULH v22.4s, v30.4s, v0.4s
- SQRDMULH v23.4s, v31.4s, v0.4s
+ SSRA v16.4s, v4.4s, 31 // signed shift right accumulate
+ SSRA v17.4s, v5.4s, 31
+ SSRA v18.4s, v6.4s, 31
+ SSRA v19.4s, v7.4s, 31
- BIC v24.16b, v24.16b, v2.16b
- BIC v25.16b, v25.16b, v2.16b
- BIC v26.16b, v26.16b, v2.16b
- BIC v27.16b, v27.16b, v2.16b
- BIC v28.16b, v28.16b, v2.16b
- BIC v29.16b, v29.16b, v2.16b
- BIC v30.16b, v30.16b, v2.16b
- BIC v31.16b, v31.16b, v2.16b
+ BIC v4.16b, v20.16b, v2.16b
+ BIC v5.16b, v21.16b, v2.16b
+ BIC v6.16b, v22.16b, v2.16b
+ BIC v7.16b, v23.16b, v2.16b
- SSRA v16.4s, v24.4s, 31
- SSRA v17.4s, v25.4s, 31
- SSRA v18.4s, v26.4s, 31
- SSRA v19.4s, v27.4s, 31
- SSRA v20.4s, v28.4s, 31
- SSRA v21.4s, v29.4s, 31
- SSRA v22.4s, v30.4s, 31
- SSRA v23.4s, v31.4s, 31
+ SQRDMULH v20.4s, v20.4s, v0.4s
+ SQRDMULH v21.4s, v21.4s, v0.4s
+ SQRDMULH v22.4s, v22.4s, v0.4s
+ SQRDMULH v23.4s, v23.4s, v0.4s
- SRSHL v4.4s, v4.4s, v1.4s // signed rounding shift left
- SRSHL v5.4s, v5.4s, v1.4s
- SRSHL v6.4s, v6.4s, v1.4s
- SRSHL v7.4s, v7.4s, v1.4s
- SRSHL v8.4s, v8.4s, v1.4s
- SRSHL v9.4s, v9.4s, v1.4s
- SRSHL v10.4s, v10.4s, v1.4s
- SRSHL v11.4s, v11.4s, v1.4s
+ SSRA v20.4s, v4.4s, 31
+ SSRA v21.4s, v5.4s, 31
+ SSRA v22.4s, v6.4s, 31
+ SSRA v23.4s, v7.4s, 31
- SRSHL v16.4s, v16.4s, v1.4s
- SRSHL v17.4s, v17.4s, v1.4s
- SRSHL v18.4s, v18.4s, v1.4s
- SRSHL v19.4s, v19.4s, v1.4s
- SRSHL v20.4s, v20.4s, v1.4s
- SRSHL v21.4s, v21.4s, v1.4s
- SRSHL v22.4s, v22.4s, v1.4s
- SRSHL v23.4s, v23.4s, v1.4s
+ BIC v4.16b, v24.16b, v2.16b
+ BIC v5.16b, v25.16b, v2.16b
+ BIC v6.16b, v26.16b, v2.16b
+ BIC v7.16b, v27.16b, v2.16b
- SQXTN v4.4h, v4.4s
- SQXTN v5.4h, v5.4s
- SQXTN v6.4h, v6.4s
- SQXTN v7.4h, v7.4s
+ SQRDMULH v24.4s, v24.4s, v0.4s
+ SQRDMULH v25.4s, v25.4s, v0.4s
+ SQRDMULH v26.4s, v26.4s, v0.4s
+ SQRDMULH v27.4s, v27.4s, v0.4s
+
+ SSRA v24.4s, v4.4s, 31
+ SSRA v25.4s, v5.4s, 31
+ SSRA v26.4s, v6.4s, 31
+ SSRA v27.4s, v7.4s, 31
+
+ BIC v4.16b, v28.16b, v2.16b
+ BIC v5.16b, v29.16b, v2.16b
+ BIC v6.16b, v30.16b, v2.16b
+ BIC v7.16b, v31.16b, v2.16b
+
+ SQRDMULH v28.4s, v28.4s, v0.4s
+ SQRDMULH v29.4s, v29.4s, v0.4s
+ SQRDMULH v30.4s, v30.4s, v0.4s
+ SQRDMULH v31.4s, v31.4s, v0.4s
+
+ SSRA v28.4s, v4.4s, 31
+ SSRA v29.4s, v5.4s, 31
+ SSRA v30.4s, v6.4s, 31
+ SSRA v31.4s, v7.4s, 31
+
+ SRSHL v16.4s, v16.4s, v1.4s // signed rounding shift left
+ SRSHL v17.4s, v17.4s, v1.4s
+ SRSHL v18.4s, v18.4s, v1.4s
+ SRSHL v19.4s, v19.4s, v1.4s
+ SRSHL v20.4s, v20.4s, v1.4s
+ SRSHL v21.4s, v21.4s, v1.4s
+ SRSHL v22.4s, v22.4s, v1.4s
+ SRSHL v23.4s, v23.4s, v1.4s
+ SRSHL v24.4s, v24.4s, v1.4s
+ SRSHL v25.4s, v25.4s, v1.4s
+ SRSHL v26.4s, v26.4s, v1.4s
+ SRSHL v27.4s, v27.4s, v1.4s
+ SRSHL v28.4s, v28.4s, v1.4s
+ SRSHL v29.4s, v29.4s, v1.4s
+ SRSHL v30.4s, v30.4s, v1.4s
+ SRSHL v31.4s, v31.4s, v1.4s
+ $elif REQUANTIZATION == "FP32":
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x8], 4
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v0.4s, v16.4s
+ FMUL v17.4s, v0.4s, v17.4s
+ FMUL v18.4s, v0.4s, v18.4s
+ FMUL v19.4s, v0.4s, v19.4s
+ FMUL v20.4s, v0.4s, v20.4s
+ FMUL v21.4s, v0.4s, v21.4s
+ FMUL v22.4s, v0.4s, v22.4s
+ FMUL v23.4s, v0.4s, v23.4s
+ FMUL v24.4s, v0.4s, v24.4s
+ FMUL v25.4s, v0.4s, v25.4s
+ FMUL v26.4s, v0.4s, v26.4s
+ FMUL v27.4s, v0.4s, v27.4s
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
SQXTN v16.4h, v16.4s
SQXTN v17.4h, v17.4s
SQXTN v18.4h, v18.4s
SQXTN v19.4h, v19.4s
- LD1R {v2.8h}, [x8], 2 // add bias
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v2.8h}, [x8], 2 // add bias
- SQXTN2 v4.8h, v8.4s
- SQXTN2 v5.8h, v9.4s
- SQXTN2 v6.8h, v10.4s
- SQXTN2 v7.8h, v11.4s
SQXTN2 v16.8h, v20.4s
SQXTN2 v17.8h, v21.4s
SQXTN2 v18.8h, v22.4s
SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
- SQADD v4.8h, v4.8h, v2.8h
- SQADD v5.8h, v5.8h, v2.8h
- SQADD v6.8h, v6.8h, v2.8h
- SQADD v7.8h, v7.8h, v2.8h
SQADD v16.8h, v16.8h, v2.8h
SQADD v17.8h, v17.8h, v2.8h
SQADD v18.8h, v18.8h, v2.8h
SQADD v19.8h, v19.8h, v2.8h
- LD1R {v0.16b}, [x8], 1 // clamp min value
+ SQADD v24.8h, v24.8h, v2.8h
+ SQADD v25.8h, v25.8h, v2.8h
+ SQADD v26.8h, v26.8h, v2.8h
+ SQADD v27.8h, v27.8h, v2.8h
+ LD1R {v0.16b}, [x8], 1 // clamp min value
- SQXTN v4.8b, v4.8h
- SQXTN v5.8b, v5.8h
- SQXTN v6.8b, v6.8h
- SQXTN v7.8b, v7.8h
- LD1R {v1.16b}, [x8] // clamp max value
- SQXTN2 v4.16b, v16.8h
- SQXTN2 v5.16b, v17.8h
- SQXTN2 v6.16b, v18.8h
- SQXTN2 v7.16b, v19.8h
- SUB x8, x8, 11 // rewind params pointer
-
- SMAX v4.16b, v4.16b, v0.16b
- SMAX v5.16b, v5.16b, v0.16b
- LDR x0, [sp, 32] // Load cn_stride
- SMAX v6.16b, v6.16b, v0.16b
- SMAX v7.16b, v7.16b, v0.16b
+ SQXTN v4.8b, v16.8h
+ SQXTN v5.8b, v17.8h
+ SQXTN v6.8b, v18.8h
+ SQXTN v7.8b, v19.8h
+ LD1R {v1.16b}, [x8] // clamp max value
+ SQXTN2 v4.16b, v24.8h
+ SQXTN2 v5.16b, v25.8h
+ SQXTN2 v6.16b, v26.8h
+ SQXTN2 v7.16b, v27.8h
+ $if REQUANTIZATION == "GEMMLOWP":
+ SUB x8, x8, 11 // rewind params pointer
+ $elif REQUANTIZATION == "FP32":
+ SUB x8, x8, 7 // rewind params pointer
+ SMAX v4.16b, v4.16b, v0.16b
+ SMAX v5.16b, v5.16b, v0.16b
+ LDR x0, [sp, 32] // cn_stride
+ SMAX v6.16b, v6.16b, v0.16b
+ SMAX v7.16b, v7.16b, v0.16b
SUBS x1, x1, 16
- SMIN v4.16b, v4.16b, v1.16b
- SMIN v5.16b, v5.16b, v1.16b
- SMIN v6.16b, v6.16b, v1.16b
- SMIN v7.16b, v7.16b, v1.16b
+ SMIN v4.16b, v4.16b, v1.16b
+ SMIN v5.16b, v5.16b, v1.16b
+ SMIN v6.16b, v6.16b, v1.16b
+ SMIN v7.16b, v7.16b, v1.16b
B.LO 7f
# Store full 4 x 16
@@ -686,7 +751,7 @@
LDP d8, d9, [sp], 32
RET
-END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
+END_FUNCTION xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_cortex_a55
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in b/src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in
index 879d8e6..a8f8577 100644
--- a/src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in
+++ b/src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in
@@ -5,7 +5,10 @@
#include <xnnpack/assembly.h>
-# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128(
# size_t mr, x0
# size_t nc, x1
# size_t kc, x2 / x0
@@ -17,7 +20,7 @@
# size_t cn_stride, [sp] -> x10
# size_t a_offset, [sp + 8] -> x11
# const float* zero, [sp + 16] -> x12
-# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x8)
+# const union ${CONV_PARAMS} params [sp + 24] -> (x8)
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -33,7 +36,7 @@
# C3 x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128
# Clamp C pointers
CMP x0, 2 // if mr < 2
@@ -191,86 +194,142 @@
LDR x8, [sp, 24] // reload params pointer
- # Apply params - scale, shift, bias and clamp
- LD2R {v0.4s, v1.4s}, [x8], 8
- CMEQ v2.4s, v1.4s, 0
+ $if REQUANTIZATION == "GEMMLOWP":
+ # Apply params - scale, shift, bias and clamp
+ LD2R {v0.4s, v1.4s}, [x8], 8
+ CMEQ v2.4s, v1.4s, 0
- BIC v4.16b, v16.16b, v2.16b
- BIC v5.16b, v17.16b, v2.16b
- BIC v6.16b, v18.16b, v2.16b
- BIC v7.16b, v19.16b, v2.16b
+ BIC v4.16b, v16.16b, v2.16b
+ BIC v5.16b, v17.16b, v2.16b
+ BIC v6.16b, v18.16b, v2.16b
+ BIC v7.16b, v19.16b, v2.16b
- SQRDMULH v16.4s, v16.4s, v0.4s
- SQRDMULH v17.4s, v17.4s, v0.4s
- SQRDMULH v18.4s, v18.4s, v0.4s
- SQRDMULH v19.4s, v19.4s, v0.4s
+ SQRDMULH v16.4s, v16.4s, v0.4s
+ SQRDMULH v17.4s, v17.4s, v0.4s
+ SQRDMULH v18.4s, v18.4s, v0.4s
+ SQRDMULH v19.4s, v19.4s, v0.4s
- SSRA v16.4s, v4.4s, 31 // signed shift right accumulate
- SSRA v17.4s, v5.4s, 31
- SSRA v18.4s, v6.4s, 31
- SSRA v19.4s, v7.4s, 31
+ SSRA v16.4s, v4.4s, 31 // signed shift right accumulate
+ SSRA v17.4s, v5.4s, 31
+ SSRA v18.4s, v6.4s, 31
+ SSRA v19.4s, v7.4s, 31
- BIC v4.16b, v20.16b, v2.16b
- BIC v5.16b, v21.16b, v2.16b
- BIC v6.16b, v22.16b, v2.16b
- BIC v7.16b, v23.16b, v2.16b
+ BIC v4.16b, v20.16b, v2.16b
+ BIC v5.16b, v21.16b, v2.16b
+ BIC v6.16b, v22.16b, v2.16b
+ BIC v7.16b, v23.16b, v2.16b
- SQRDMULH v20.4s, v20.4s, v0.4s
- SQRDMULH v21.4s, v21.4s, v0.4s
- SQRDMULH v22.4s, v22.4s, v0.4s
- SQRDMULH v23.4s, v23.4s, v0.4s
+ SQRDMULH v20.4s, v20.4s, v0.4s
+ SQRDMULH v21.4s, v21.4s, v0.4s
+ SQRDMULH v22.4s, v22.4s, v0.4s
+ SQRDMULH v23.4s, v23.4s, v0.4s
- SSRA v20.4s, v4.4s, 31
- SSRA v21.4s, v5.4s, 31
- SSRA v22.4s, v6.4s, 31
- SSRA v23.4s, v7.4s, 31
+ SSRA v20.4s, v4.4s, 31
+ SSRA v21.4s, v5.4s, 31
+ SSRA v22.4s, v6.4s, 31
+ SSRA v23.4s, v7.4s, 31
- BIC v4.16b, v24.16b, v2.16b
- BIC v5.16b, v25.16b, v2.16b
- BIC v6.16b, v26.16b, v2.16b
- BIC v7.16b, v27.16b, v2.16b
+ BIC v4.16b, v24.16b, v2.16b
+ BIC v5.16b, v25.16b, v2.16b
+ BIC v6.16b, v26.16b, v2.16b
+ BIC v7.16b, v27.16b, v2.16b
- SQRDMULH v24.4s, v24.4s, v0.4s
- SQRDMULH v25.4s, v25.4s, v0.4s
- SQRDMULH v26.4s, v26.4s, v0.4s
- SQRDMULH v27.4s, v27.4s, v0.4s
+ SQRDMULH v24.4s, v24.4s, v0.4s
+ SQRDMULH v25.4s, v25.4s, v0.4s
+ SQRDMULH v26.4s, v26.4s, v0.4s
+ SQRDMULH v27.4s, v27.4s, v0.4s
- SSRA v24.4s, v4.4s, 31
- SSRA v25.4s, v5.4s, 31
- SSRA v26.4s, v6.4s, 31
- SSRA v27.4s, v7.4s, 31
+ SSRA v24.4s, v4.4s, 31
+ SSRA v25.4s, v5.4s, 31
+ SSRA v26.4s, v6.4s, 31
+ SSRA v27.4s, v7.4s, 31
- BIC v4.16b, v28.16b, v2.16b
- BIC v5.16b, v29.16b, v2.16b
- BIC v6.16b, v30.16b, v2.16b
- BIC v7.16b, v31.16b, v2.16b
+ BIC v4.16b, v28.16b, v2.16b
+ BIC v5.16b, v29.16b, v2.16b
+ BIC v6.16b, v30.16b, v2.16b
+ BIC v7.16b, v31.16b, v2.16b
- SQRDMULH v28.4s, v28.4s, v0.4s
- SQRDMULH v29.4s, v29.4s, v0.4s
- SQRDMULH v30.4s, v30.4s, v0.4s
- SQRDMULH v31.4s, v31.4s, v0.4s
+ SQRDMULH v28.4s, v28.4s, v0.4s
+ SQRDMULH v29.4s, v29.4s, v0.4s
+ SQRDMULH v30.4s, v30.4s, v0.4s
+ SQRDMULH v31.4s, v31.4s, v0.4s
- SSRA v28.4s, v4.4s, 31
- SSRA v29.4s, v5.4s, 31
- SSRA v30.4s, v6.4s, 31
- SSRA v31.4s, v7.4s, 31
+ SSRA v28.4s, v4.4s, 31
+ SSRA v29.4s, v5.4s, 31
+ SSRA v30.4s, v6.4s, 31
+ SSRA v31.4s, v7.4s, 31
- SRSHL v16.4s, v16.4s, v1.4s // signed rounding shift left
- SRSHL v17.4s, v17.4s, v1.4s
- SRSHL v18.4s, v18.4s, v1.4s
- SRSHL v19.4s, v19.4s, v1.4s
- SRSHL v20.4s, v20.4s, v1.4s
- SRSHL v21.4s, v21.4s, v1.4s
- SRSHL v22.4s, v22.4s, v1.4s
- SRSHL v23.4s, v23.4s, v1.4s
- SRSHL v24.4s, v24.4s, v1.4s
- SRSHL v25.4s, v25.4s, v1.4s
- SRSHL v26.4s, v26.4s, v1.4s
- SRSHL v27.4s, v27.4s, v1.4s
- SRSHL v28.4s, v28.4s, v1.4s
- SRSHL v29.4s, v29.4s, v1.4s
- SRSHL v30.4s, v30.4s, v1.4s
- SRSHL v31.4s, v31.4s, v1.4s
+ SRSHL v16.4s, v16.4s, v1.4s // signed rounding shift left
+ SRSHL v17.4s, v17.4s, v1.4s
+ SRSHL v18.4s, v18.4s, v1.4s
+ SRSHL v19.4s, v19.4s, v1.4s
+ SRSHL v20.4s, v20.4s, v1.4s
+ SRSHL v21.4s, v21.4s, v1.4s
+ SRSHL v22.4s, v22.4s, v1.4s
+ SRSHL v23.4s, v23.4s, v1.4s
+ SRSHL v24.4s, v24.4s, v1.4s
+ SRSHL v25.4s, v25.4s, v1.4s
+ SRSHL v26.4s, v26.4s, v1.4s
+ SRSHL v27.4s, v27.4s, v1.4s
+ SRSHL v28.4s, v28.4s, v1.4s
+ SRSHL v29.4s, v29.4s, v1.4s
+ SRSHL v30.4s, v30.4s, v1.4s
+ SRSHL v31.4s, v31.4s, v1.4s
+ $elif REQUANTIZATION == "FP32":
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x8], 4
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v0.4s, v16.4s
+ FMUL v17.4s, v0.4s, v17.4s
+ FMUL v18.4s, v0.4s, v18.4s
+ FMUL v19.4s, v0.4s, v19.4s
+ FMUL v20.4s, v0.4s, v20.4s
+ FMUL v21.4s, v0.4s, v21.4s
+ FMUL v22.4s, v0.4s, v22.4s
+ FMUL v23.4s, v0.4s, v23.4s
+ FMUL v24.4s, v0.4s, v24.4s
+ FMUL v25.4s, v0.4s, v25.4s
+ FMUL v26.4s, v0.4s, v26.4s
+ FMUL v27.4s, v0.4s, v27.4s
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
SQXTN v16.4h, v16.4s
SQXTN v17.4h, v17.4s
@@ -280,7 +339,7 @@
SQXTN v25.4h, v25.4s
SQXTN v26.4h, v26.4s
SQXTN v27.4h, v27.4s
- LD1R {v2.8h}, [x8], 2 // add bias
+ LD1R {v2.8h}, [x8], 2 // add bias
SQXTN2 v16.8h, v20.4s
SQXTN2 v17.8h, v21.4s
@@ -299,13 +358,13 @@
SQADD v25.8h, v25.8h, v2.8h
SQADD v26.8h, v26.8h, v2.8h
SQADD v27.8h, v27.8h, v2.8h
- LD1R {v0.16b}, [x8], 1 // clamp min value
+ LD1R {v0.16b}, [x8], 1 // clamp min value
SQXTN v4.8b, v16.8h
SQXTN v5.8b, v17.8h
SQXTN v6.8b, v18.8h
SQXTN v7.8b, v19.8h
- LD1R {v1.16b}, [x8] // clamp max value
+ LD1R {v1.16b}, [x8] // clamp max value
SQXTN2 v4.16b, v24.8h
SQXTN2 v5.16b, v25.8h
SQXTN2 v6.16b, v26.8h
@@ -454,7 +513,7 @@
10:
RET
-END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128
+END_FUNCTION xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld128
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in b/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in
index b463c7f..44e95a5 100644
--- a/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in
+++ b/src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in
@@ -5,7 +5,10 @@
#include <xnnpack/assembly.h>
-# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64(
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
+$CONV_PARAMS = "xnn_qs8_minmax_params"
+
+# void xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld64(
# size_t mr, x0
# size_t nc, x1
# size_t kc, x2 / x0
@@ -17,7 +20,7 @@
# size_t cn_stride, [sp] -> x10
# size_t a_offset, [sp + 8] -> x11
# const float* zero, [sp + 16] -> x12
-# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x8)
+# const union ${CONV_PARAMS} params [sp + 24] -> (x8)
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -33,7 +36,7 @@
# C3 x7 v19 v23 v27 v31
# unused v8 v9 v10 v11 v12 v13 v14 v15
-BEGIN_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld64
# Clamp C pointers
CMP x0, 2 // if mr < 2
@@ -151,86 +154,141 @@
B.HI 1b
3:
- # Apply params - scale, shift, bias and clamp
- LD2R {v0.4s, v1.4s}, [x8], 8
- CMEQ v2.4s, v1.4s, 0
+ $if REQUANTIZATION == "GEMMLOWP":
+ # Apply params - scale, shift, bias and clamp
+ LD2R {v0.4s, v1.4s}, [x8], 8
+ CMEQ v2.4s, v1.4s, 0
- BIC v4.16b, v16.16b, v2.16b
- BIC v5.16b, v17.16b, v2.16b
- BIC v6.16b, v18.16b, v2.16b
- BIC v7.16b, v19.16b, v2.16b
+ BIC v4.16b, v16.16b, v2.16b
+ BIC v5.16b, v17.16b, v2.16b
+ BIC v6.16b, v18.16b, v2.16b
+ BIC v7.16b, v19.16b, v2.16b
- SQRDMULH v16.4s, v16.4s, v0.4s
- SQRDMULH v17.4s, v17.4s, v0.4s
- SQRDMULH v18.4s, v18.4s, v0.4s
- SQRDMULH v19.4s, v19.4s, v0.4s
+ SQRDMULH v16.4s, v16.4s, v0.4s
+ SQRDMULH v17.4s, v17.4s, v0.4s
+ SQRDMULH v18.4s, v18.4s, v0.4s
+ SQRDMULH v19.4s, v19.4s, v0.4s
- SSRA v16.4s, v4.4s, 31 // signed shift right accumulate
- SSRA v17.4s, v5.4s, 31
- SSRA v18.4s, v6.4s, 31
- SSRA v19.4s, v7.4s, 31
+ SSRA v16.4s, v4.4s, 31 // signed shift right accumulate
+ SSRA v17.4s, v5.4s, 31
+ SSRA v18.4s, v6.4s, 31
+ SSRA v19.4s, v7.4s, 31
- BIC v4.16b, v20.16b, v2.16b
- BIC v5.16b, v21.16b, v2.16b
- BIC v6.16b, v22.16b, v2.16b
- BIC v7.16b, v23.16b, v2.16b
+ BIC v4.16b, v20.16b, v2.16b
+ BIC v5.16b, v21.16b, v2.16b
+ BIC v6.16b, v22.16b, v2.16b
+ BIC v7.16b, v23.16b, v2.16b
- SQRDMULH v20.4s, v20.4s, v0.4s
- SQRDMULH v21.4s, v21.4s, v0.4s
- SQRDMULH v22.4s, v22.4s, v0.4s
- SQRDMULH v23.4s, v23.4s, v0.4s
+ SQRDMULH v20.4s, v20.4s, v0.4s
+ SQRDMULH v21.4s, v21.4s, v0.4s
+ SQRDMULH v22.4s, v22.4s, v0.4s
+ SQRDMULH v23.4s, v23.4s, v0.4s
- SSRA v20.4s, v4.4s, 31
- SSRA v21.4s, v5.4s, 31
- SSRA v22.4s, v6.4s, 31
- SSRA v23.4s, v7.4s, 31
+ SSRA v20.4s, v4.4s, 31
+ SSRA v21.4s, v5.4s, 31
+ SSRA v22.4s, v6.4s, 31
+ SSRA v23.4s, v7.4s, 31
- BIC v4.16b, v24.16b, v2.16b
- BIC v5.16b, v25.16b, v2.16b
- BIC v6.16b, v26.16b, v2.16b
- BIC v7.16b, v27.16b, v2.16b
+ BIC v4.16b, v24.16b, v2.16b
+ BIC v5.16b, v25.16b, v2.16b
+ BIC v6.16b, v26.16b, v2.16b
+ BIC v7.16b, v27.16b, v2.16b
- SQRDMULH v24.4s, v24.4s, v0.4s
- SQRDMULH v25.4s, v25.4s, v0.4s
- SQRDMULH v26.4s, v26.4s, v0.4s
- SQRDMULH v27.4s, v27.4s, v0.4s
+ SQRDMULH v24.4s, v24.4s, v0.4s
+ SQRDMULH v25.4s, v25.4s, v0.4s
+ SQRDMULH v26.4s, v26.4s, v0.4s
+ SQRDMULH v27.4s, v27.4s, v0.4s
- SSRA v24.4s, v4.4s, 31
- SSRA v25.4s, v5.4s, 31
- SSRA v26.4s, v6.4s, 31
- SSRA v27.4s, v7.4s, 31
+ SSRA v24.4s, v4.4s, 31
+ SSRA v25.4s, v5.4s, 31
+ SSRA v26.4s, v6.4s, 31
+ SSRA v27.4s, v7.4s, 31
- BIC v4.16b, v28.16b, v2.16b
- BIC v5.16b, v29.16b, v2.16b
- BIC v6.16b, v30.16b, v2.16b
- BIC v7.16b, v31.16b, v2.16b
+ BIC v4.16b, v28.16b, v2.16b
+ BIC v5.16b, v29.16b, v2.16b
+ BIC v6.16b, v30.16b, v2.16b
+ BIC v7.16b, v31.16b, v2.16b
- SQRDMULH v28.4s, v28.4s, v0.4s
- SQRDMULH v29.4s, v29.4s, v0.4s
- SQRDMULH v30.4s, v30.4s, v0.4s
- SQRDMULH v31.4s, v31.4s, v0.4s
+ SQRDMULH v28.4s, v28.4s, v0.4s
+ SQRDMULH v29.4s, v29.4s, v0.4s
+ SQRDMULH v30.4s, v30.4s, v0.4s
+ SQRDMULH v31.4s, v31.4s, v0.4s
- SSRA v28.4s, v4.4s, 31
- SSRA v29.4s, v5.4s, 31
- SSRA v30.4s, v6.4s, 31
- SSRA v31.4s, v7.4s, 31
+ SSRA v28.4s, v4.4s, 31
+ SSRA v29.4s, v5.4s, 31
+ SSRA v30.4s, v6.4s, 31
+ SSRA v31.4s, v7.4s, 31
- SRSHL v16.4s, v16.4s, v1.4s // signed rounding shift left
- SRSHL v17.4s, v17.4s, v1.4s
- SRSHL v18.4s, v18.4s, v1.4s
- SRSHL v19.4s, v19.4s, v1.4s
- SRSHL v20.4s, v20.4s, v1.4s
- SRSHL v21.4s, v21.4s, v1.4s
- SRSHL v22.4s, v22.4s, v1.4s
- SRSHL v23.4s, v23.4s, v1.4s
- SRSHL v24.4s, v24.4s, v1.4s
- SRSHL v25.4s, v25.4s, v1.4s
- SRSHL v26.4s, v26.4s, v1.4s
- SRSHL v27.4s, v27.4s, v1.4s
- SRSHL v28.4s, v28.4s, v1.4s
- SRSHL v29.4s, v29.4s, v1.4s
- SRSHL v30.4s, v30.4s, v1.4s
- SRSHL v31.4s, v31.4s, v1.4s
+ SRSHL v16.4s, v16.4s, v1.4s // signed rounding shift left
+ SRSHL v17.4s, v17.4s, v1.4s
+ SRSHL v18.4s, v18.4s, v1.4s
+ SRSHL v19.4s, v19.4s, v1.4s
+ SRSHL v20.4s, v20.4s, v1.4s
+ SRSHL v21.4s, v21.4s, v1.4s
+ SRSHL v22.4s, v22.4s, v1.4s
+ SRSHL v23.4s, v23.4s, v1.4s
+ SRSHL v24.4s, v24.4s, v1.4s
+ SRSHL v25.4s, v25.4s, v1.4s
+ SRSHL v26.4s, v26.4s, v1.4s
+ SRSHL v27.4s, v27.4s, v1.4s
+ SRSHL v28.4s, v28.4s, v1.4s
+ SRSHL v29.4s, v29.4s, v1.4s
+ SRSHL v30.4s, v30.4s, v1.4s
+ SRSHL v31.4s, v31.4s, v1.4s
+ $elif REQUANTIZATION == "FP32":
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x8], 4
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v0.4s, v16.4s
+ FMUL v17.4s, v0.4s, v17.4s
+ FMUL v18.4s, v0.4s, v18.4s
+ FMUL v19.4s, v0.4s, v19.4s
+ FMUL v20.4s, v0.4s, v20.4s
+ FMUL v21.4s, v0.4s, v21.4s
+ FMUL v22.4s, v0.4s, v22.4s
+ FMUL v23.4s, v0.4s, v23.4s
+ FMUL v24.4s, v0.4s, v24.4s
+ FMUL v25.4s, v0.4s, v25.4s
+ FMUL v26.4s, v0.4s, v26.4s
+ FMUL v27.4s, v0.4s, v27.4s
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
SQXTN v16.4h, v16.4s
SQXTN v17.4h, v17.4s
@@ -368,7 +426,7 @@
9:
RET
-END_FUNCTION xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
+END_FUNCTION xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_4x16c4__aarch64_neondot_ld64
#ifdef __ELF__
.section ".note.GNU-stack","",%progbits
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S b/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
new file mode 100644
index 0000000..beea3aa
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
@@ -0,0 +1,675 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const int8_t**restrict a, x4
+# const int8_t* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> (x0)
+# size_t a_offset, [sp + 8] -> x11
+# const float* zero, [sp + 16] -> x12
+# const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0 v4
+# A1 x14 v1 v5
+# A2 x15 v2 v6
+# A3 x10 v3 v7
+# B x5 v8 v9 v10 v11
+# C0 x6 v16 v20 v24 v28
+# C1 x16 v17 v21 v25 v29
+# C2 x17 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v12 v13 v14 v15
+
+# x8 temp for Cortex-A55 loads
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+
+ # Clamp C pointers
+ CMP x0, 2 // if mr < 2
+ LDR x11, [sp, 8] // Load a_offset
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ LDP x12, x8, [sp, 16] // Load zero, params pointer
+ CSEL x16, x6, x16, LO // c1 = c0
+ ADD x2, x2, 3 // kc = (kc + 3) & ~3
+ STP d8, d9, [sp, -32]! // Save d8-d11 on stack
+
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ STP d10, d11, [sp, 16]
+ // if mr <= 2
+ CSEL x17, x16, x17, LS // c2 = c1
+ BIC x2, x2, 3
+
+ CMP x0, 4 // if mr < 4
+ ADD x7, x17, x7 // c3 = c2 + cm_stride
+ CSEL x7, x17, x7, LO // c3 = c2
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ MOV x9, x3 // p = ks
+
+ .p2align 3
+1:
+ # Load next 4 A pointers
+ LDP x13, x14, [x4], 16
+ LDP x15, x10, [x4], 16
+
+ CMP x13, x12 // if a0 == zero
+ ADD x13, x13, x11 // a0 += a_offset
+ CSEL x13, x12, x13, EQ // a0 = zero, else += a0 + a_offset
+ CMP x14, x12 // if a1 == zero
+ ADD x14, x14, x11 // a1 += a_offset
+ CSEL x14, x12, x14, EQ // a1 = zero, else += a1 + a_offset
+ CMP x15, x12 // if a2 == zero
+ ADD x15, x15, x11 // a2 += a_offset
+ CSEL x15, x12, x15, EQ // a2 = zero, else += a2 + a_offset
+ CMP x10, x12 // if a3 == zero
+ ADD x10, x10, x11 // a3 += a_offset
+ CSEL x10, x12, x10, EQ // a3 = zero, else += a3 + a_offset
+
+ # Is there at least 16 bytes for prologue/epilogue?
+ SUBS x0, x2, 16 // k = kc - 16
+ B.LO 5f
+
+ # prologue - read A and B values for block 0 and 1
+ LDR d0, [x13], 8
+ LDR q8, [x5], 16
+ LDR d1, [x14], 8
+ LDR d2, [x15], 8
+ LDR d3, [x10], 8
+ SUBS x0, x0, 16 // is there 16 for main loop?
+ LDR d9, [x5], 8
+ LDR x8, [x5], 8
+ # Is there at least 16 bytes for main loop?
+ B.LO 3f
+
+ # Main loop - 16 bytes of A in 4 groups.
+ # 4 row of 4 vectors wide = 16 sdot instructions for 4 channels
+ # 4 LD64 for A
+ # 4 LD128 for W. = 2 LD64 + INS.
+ # for each 4 sdot, 1 LD64 for A, 2 LD64 for W + INS.
+
+ .p2align 3
+2:
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v0.4b[0]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v1.4b[0]
+ INS v9.d[1], x8
+ SDOT v18.4s, v8.16b, v2.4b[0]
+ LDR x8, [x5], 8
+ SDOT v19.4s, v8.16b, v3.4b[0]
+ LDR d4, [x13], 8
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v0.4b[0]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v1.4b[0]
+ INS v10.d[1], x8
+ SDOT v22.4s, v9.16b, v2.4b[0]
+ LDR x8, [x5], 8
+ SDOT v23.4s, v9.16b, v3.4b[0]
+ LDR d5, [x14], 8
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v0.4b[0]
+ LDR d8, [x5], 8
+ SDOT v25.4s, v10.16b, v1.4b[0]
+ INS v11.d[1], x8
+ SDOT v26.4s, v10.16b, v2.4b[0]
+ LDR x8, [x5], 8
+ SDOT v27.4s, v10.16b, v3.4b[0]
+ LDR d6, [x15], 8
+
+ # BLOCK 3
+ SDOT v28.4s, v11.16b, v0.4b[0]
+ LDR d9, [x5], 8
+ SDOT v29.4s, v11.16b, v1.4b[0]
+ INS v8.d[1], x8
+ SDOT v30.4s, v11.16b, v2.4b[0]
+ LDR x8, [x5], 8
+ SDOT v31.4s, v11.16b, v3.4b[0]
+ LDR d7, [x10], 8
+
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v0.4b[1]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v1.4b[1]
+ INS v9.d[1], x8
+ SDOT v18.4s, v8.16b, v2.4b[1]
+ LDR x8, [x5], 8
+ SDOT v19.4s, v8.16b, v3.4b[1]
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v0.4b[1]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v1.4b[1]
+ INS v10.d[1], x8
+ SDOT v22.4s, v9.16b, v2.4b[1]
+ LDR x8, [x5], 8
+ SDOT v23.4s, v9.16b, v3.4b[1]
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v0.4b[1]
+ LDR d8, [x5], 8
+ SDOT v25.4s, v10.16b, v1.4b[1]
+ INS v11.d[1], x8
+ SDOT v26.4s, v10.16b, v2.4b[1]
+ LDR x8, [x5], 8
+ SDOT v27.4s, v10.16b, v3.4b[1]
+
+	# BLOCK 3
+ SDOT v28.4s, v11.16b, v0.4b[1]
+ LDR d9, [x5], 8
+ SDOT v29.4s, v11.16b, v1.4b[1]
+ INS v8.d[1], x8
+ SDOT v30.4s, v11.16b, v2.4b[1]
+ LDR x8, [x5], 8
+ SDOT v31.4s, v11.16b, v3.4b[1]
+
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v4.4b[0]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v5.4b[0]
+ INS v9.d[1], x8
+ SDOT v18.4s, v8.16b, v6.4b[0]
+ LDR x8, [x5], 8
+ SDOT v19.4s, v8.16b, v7.4b[0]
+ LDR d0, [x13], 8
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v4.4b[0]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v5.4b[0]
+ INS v10.d[1], x8
+ SDOT v22.4s, v9.16b, v6.4b[0]
+ LDR x8, [x5], 8
+ SDOT v23.4s, v9.16b, v7.4b[0]
+ LDR d1, [x14], 8
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v4.4b[0]
+ LDR d8, [x5], 8
+ SDOT v25.4s, v10.16b, v5.4b[0]
+ INS v11.d[1], x8
+ SDOT v26.4s, v10.16b, v6.4b[0]
+ LDR x8, [x5], 8
+ SDOT v27.4s, v10.16b, v7.4b[0]
+ LDR d2, [x15], 8
+
+ # BLOCK 3
+ SDOT v28.4s, v11.16b, v4.4b[0]
+ LDR d9, [x5], 8
+ SDOT v29.4s, v11.16b, v5.4b[0]
+ INS v8.d[1], x8
+ SDOT v30.4s, v11.16b, v6.4b[0]
+ LDR x8, [x5], 8
+ SDOT v31.4s, v11.16b, v7.4b[0]
+ LDR d3, [x10], 8
+
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v4.4b[1]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v5.4b[1]
+ INS v9.d[1], x8
+ SDOT v18.4s, v8.16b, v6.4b[1]
+ LDR x8, [x5], 8
+ SDOT v19.4s, v8.16b, v7.4b[1]
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v4.4b[1]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v5.4b[1]
+ INS v10.d[1], x8
+ SDOT v22.4s, v9.16b, v6.4b[1]
+ LDR x8, [x5], 8
+ SDOT v23.4s, v9.16b, v7.4b[1]
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v4.4b[1]
+ LDR d8, [x5], 8 // First B values for block 0 and 1
+ SDOT v25.4s, v10.16b, v5.4b[1]
+ INS v11.d[1], x8
+ SDOT v26.4s, v10.16b, v6.4b[1]
+ LDR x8, [x5], 8
+ SDOT v27.4s, v10.16b, v7.4b[1]
+ SUBS x0, x0, 16
+
+ # BLOCK 3
+ SDOT v28.4s, v11.16b, v4.4b[1]
+ LDR d9, [x5], 8
+ SDOT v29.4s, v11.16b, v5.4b[1]
+ INS v8.d[1], x8
+ SDOT v30.4s, v11.16b, v6.4b[1]
+ LDR x8, [x5], 8
+ SDOT v31.4s, v11.16b, v7.4b[1]
+ B.HS 2b
+
+ # Epilogue. Same as main loop but no preloads in final group
+3:
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v0.4b[0]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v1.4b[0]
+ INS v9.d[1], x8
+ SDOT v18.4s, v8.16b, v2.4b[0]
+ LDR x8, [x5], 8
+ SDOT v19.4s, v8.16b, v3.4b[0]
+ LDR d4, [x13], 8
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v0.4b[0]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v1.4b[0]
+ INS v10.d[1], x8
+ SDOT v22.4s, v9.16b, v2.4b[0]
+ LDR x8, [x5], 8
+ SDOT v23.4s, v9.16b, v3.4b[0]
+ LDR d5, [x14], 8
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v0.4b[0]
+ LDR d8, [x5], 8
+ SDOT v25.4s, v10.16b, v1.4b[0]
+ INS v11.d[1], x8
+ SDOT v26.4s, v10.16b, v2.4b[0]
+ LDR x8, [x5], 8
+ SDOT v27.4s, v10.16b, v3.4b[0]
+ LDR d6, [x15], 8
+
+ # BLOCK 3
+ SDOT v28.4s, v11.16b, v0.4b[0]
+ LDR d9, [x5], 8
+ SDOT v29.4s, v11.16b, v1.4b[0]
+ INS v8.d[1], x8
+ SDOT v30.4s, v11.16b, v2.4b[0]
+ LDR x8, [x5], 8
+ SDOT v31.4s, v11.16b, v3.4b[0]
+ LDR d7, [x10], 8
+
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v0.4b[1]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v1.4b[1]
+ INS v9.d[1], x8
+ SDOT v18.4s, v8.16b, v2.4b[1]
+ LDR x8, [x5], 8
+ SDOT v19.4s, v8.16b, v3.4b[1]
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v0.4b[1]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v1.4b[1]
+ INS v10.d[1], x8
+ SDOT v22.4s, v9.16b, v2.4b[1]
+ LDR x8, [x5], 8
+ SDOT v23.4s, v9.16b, v3.4b[1]
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v0.4b[1]
+ LDR d8, [x5], 8
+ SDOT v25.4s, v10.16b, v1.4b[1]
+ INS v11.d[1], x8
+ SDOT v26.4s, v10.16b, v2.4b[1]
+ LDR x8, [x5], 8
+ SDOT v27.4s, v10.16b, v3.4b[1]
+
+	# BLOCK 3
+ SDOT v28.4s, v11.16b, v0.4b[1]
+ LDR d9, [x5], 8
+ SDOT v29.4s, v11.16b, v1.4b[1]
+ INS v8.d[1], x8
+ SDOT v30.4s, v11.16b, v2.4b[1]
+ LDR x8, [x5], 8
+ SDOT v31.4s, v11.16b, v3.4b[1]
+
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v4.4b[0]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v5.4b[0]
+ INS v9.d[1], x8
+ SDOT v18.4s, v8.16b, v6.4b[0]
+ LDR x8, [x5], 8
+ SDOT v19.4s, v8.16b, v7.4b[0]
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v4.4b[0]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v5.4b[0]
+ INS v10.d[1], x8
+ SDOT v22.4s, v9.16b, v6.4b[0]
+ LDR x8, [x5], 8
+ SDOT v23.4s, v9.16b, v7.4b[0]
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v4.4b[0]
+ LDR d8, [x5], 8
+ SDOT v25.4s, v10.16b, v5.4b[0]
+ INS v11.d[1], x8
+ SDOT v26.4s, v10.16b, v6.4b[0]
+ LDR x8, [x5], 8
+ SDOT v27.4s, v10.16b, v7.4b[0]
+
+ # BLOCK 3
+ SDOT v28.4s, v11.16b, v4.4b[0]
+ LDR d9, [x5], 8
+ SDOT v29.4s, v11.16b, v5.4b[0]
+ INS v8.d[1], x8
+ SDOT v30.4s, v11.16b, v6.4b[0]
+ LDR x8, [x5], 8
+ SDOT v31.4s, v11.16b, v7.4b[0]
+
+ # BLOCK 0
+ SDOT v16.4s, v8.16b, v4.4b[1]
+ LDR d10, [x5], 8
+ SDOT v17.4s, v8.16b, v5.4b[1]
+ INS v9.d[1], x8
+ SDOT v18.4s, v8.16b, v6.4b[1]
+ LDR x8, [x5], 8
+ SDOT v19.4s, v8.16b, v7.4b[1]
+
+ # BLOCK 1
+ SDOT v20.4s, v9.16b, v4.4b[1]
+ LDR d11, [x5], 8
+ SDOT v21.4s, v9.16b, v5.4b[1]
+ INS v10.d[1], x8
+ SDOT v22.4s, v9.16b, v6.4b[1]
+ LDR x8, [x5], 8
+ SDOT v23.4s, v9.16b, v7.4b[1]
+
+ # BLOCK 2
+ SDOT v24.4s, v10.16b, v4.4b[1]
+ SDOT v25.4s, v10.16b, v5.4b[1]
+ INS v11.d[1], x8
+ SDOT v26.4s, v10.16b, v6.4b[1]
+ SDOT v27.4s, v10.16b, v7.4b[1]
+ AND x0, x2, 15 // kc remainder 0 to 12
+
+ # BLOCK 3
+ SDOT v28.4s, v11.16b, v4.4b[1]
+ SDOT v29.4s, v11.16b, v5.4b[1]
+ LDR x8, [sp, 56] // reload params pointer
+ SDOT v30.4s, v11.16b, v6.4b[1]
+ SDOT v31.4s, v11.16b, v7.4b[1]
+
+ # Is there a remainder?- 4 to 12 bytes of A
+ CBNZ x0, 6f
+
+ .p2align 3
+4:
+ # ks loop
+ SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
+ B.HI 1b
+
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x8], 4
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v0.4s, v16.4s
+ FMUL v17.4s, v0.4s, v17.4s
+ FMUL v18.4s, v0.4s, v18.4s
+ FMUL v19.4s, v0.4s, v19.4s
+ FMUL v20.4s, v0.4s, v20.4s
+ FMUL v21.4s, v0.4s, v21.4s
+ FMUL v22.4s, v0.4s, v22.4s
+ FMUL v23.4s, v0.4s, v23.4s
+ FMUL v24.4s, v0.4s, v24.4s
+ FMUL v25.4s, v0.4s, v25.4s
+ FMUL v26.4s, v0.4s, v26.4s
+ FMUL v27.4s, v0.4s, v27.4s
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v2.8h}, [x8], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v2.8h
+ SQADD v17.8h, v17.8h, v2.8h
+ SQADD v18.8h, v18.8h, v2.8h
+ SQADD v19.8h, v19.8h, v2.8h
+ SQADD v24.8h, v24.8h, v2.8h
+ SQADD v25.8h, v25.8h, v2.8h
+ SQADD v26.8h, v26.8h, v2.8h
+ SQADD v27.8h, v27.8h, v2.8h
+ LD1R {v0.16b}, [x8], 1 // clamp min value
+
+ SQXTN v4.8b, v16.8h
+ SQXTN v5.8b, v17.8h
+ SQXTN v6.8b, v18.8h
+ SQXTN v7.8b, v19.8h
+ LD1R {v1.16b}, [x8] // clamp max value
+ SQXTN2 v4.16b, v24.8h
+ SQXTN2 v5.16b, v25.8h
+ SQXTN2 v6.16b, v26.8h
+ SQXTN2 v7.16b, v27.8h
+ SUB x8, x8, 7 // rewind params pointer
+ SMAX v4.16b, v4.16b, v0.16b
+ SMAX v5.16b, v5.16b, v0.16b
+ LDR x0, [sp, 32] // cn_stride
+ SMAX v6.16b, v6.16b, v0.16b
+ SMAX v7.16b, v7.16b, v0.16b
+ SUBS x1, x1, 16
+ SMIN v4.16b, v4.16b, v1.16b
+ SMIN v5.16b, v5.16b, v1.16b
+ SMIN v6.16b, v6.16b, v1.16b
+ SMIN v7.16b, v7.16b, v1.16b
+ B.LO 7f
+
+ # Store full 4 x 16
+ ST1 {v7.16b}, [x7], x0
+ ST1 {v6.16b}, [x17], x0
+ ST1 {v5.16b}, [x16], x0
+ ST1 {v4.16b}, [x6], x0
+
+ SUB x4, x4, x3 // a -= ks
+
+ # nc loop
+ B.HI 0b
+
+ # Restore d8-d15 from stack
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 32
+ RET
+
+ # Remainder- 4 to 12 bytes of A
+	# Although C4, it's safe to read 16 bytes.
+ .p2align 3
+5:
+ AND x0, x2, 15 // kc remainder 4 to 12
+6:
+ LDR q0, [x13]
+ LDP q8, q9, [x5], 32
+ LDR q1, [x14]
+ LDR q2, [x15]
+ LDR q3, [x10]
+ LDP q10, q11, [x5], 32
+ SDOT v16.4s, v8.16b, v0.4b[0]
+ SDOT v17.4s, v8.16b, v1.4b[0]
+ SDOT v18.4s, v8.16b, v2.4b[0]
+ SDOT v19.4s, v8.16b, v3.4b[0]
+ SDOT v20.4s, v9.16b, v0.4b[0]
+ SDOT v21.4s, v9.16b, v1.4b[0]
+ SDOT v22.4s, v9.16b, v2.4b[0]
+ SDOT v23.4s, v9.16b, v3.4b[0]
+ SDOT v24.4s, v10.16b, v0.4b[0]
+ SDOT v25.4s, v10.16b, v1.4b[0]
+ SDOT v26.4s, v10.16b, v2.4b[0]
+ SDOT v27.4s, v10.16b, v3.4b[0]
+ SDOT v28.4s, v11.16b, v0.4b[0]
+ SDOT v29.4s, v11.16b, v1.4b[0]
+ SDOT v30.4s, v11.16b, v2.4b[0]
+ SDOT v31.4s, v11.16b, v3.4b[0]
+ CMP x0, 4
+ B.LS 4b
+ LDP q8, q9, [x5], 32
+ LDP q10, q11, [x5], 32
+ SDOT v16.4s, v8.16b, v0.4b[1]
+ SDOT v17.4s, v8.16b, v1.4b[1]
+ SDOT v18.4s, v8.16b, v2.4b[1]
+ SDOT v19.4s, v8.16b, v3.4b[1]
+ SDOT v20.4s, v9.16b, v0.4b[1]
+ SDOT v21.4s, v9.16b, v1.4b[1]
+ SDOT v22.4s, v9.16b, v2.4b[1]
+ SDOT v23.4s, v9.16b, v3.4b[1]
+ SDOT v24.4s, v10.16b, v0.4b[1]
+ SDOT v25.4s, v10.16b, v1.4b[1]
+ SDOT v26.4s, v10.16b, v2.4b[1]
+ SDOT v27.4s, v10.16b, v3.4b[1]
+ SDOT v28.4s, v11.16b, v0.4b[1]
+ SDOT v29.4s, v11.16b, v1.4b[1]
+ SDOT v30.4s, v11.16b, v2.4b[1]
+ SDOT v31.4s, v11.16b, v3.4b[1]
+ CMP x0, 8
+ B.LS 4b
+ LDP q8, q9, [x5], 32
+ LDP q10, q11, [x5], 32
+ SDOT v16.4s, v8.16b, v0.4b[2]
+ SDOT v17.4s, v8.16b, v1.4b[2]
+ SDOT v18.4s, v8.16b, v2.4b[2]
+ SDOT v19.4s, v8.16b, v3.4b[2]
+ SDOT v20.4s, v9.16b, v0.4b[2]
+ SDOT v21.4s, v9.16b, v1.4b[2]
+ SDOT v22.4s, v9.16b, v2.4b[2]
+ SDOT v23.4s, v9.16b, v3.4b[2]
+ SDOT v24.4s, v10.16b, v0.4b[2]
+ SDOT v25.4s, v10.16b, v1.4b[2]
+ SDOT v26.4s, v10.16b, v2.4b[2]
+ SDOT v27.4s, v10.16b, v3.4b[2]
+ SDOT v28.4s, v11.16b, v0.4b[2]
+ SDOT v29.4s, v11.16b, v1.4b[2]
+ SDOT v30.4s, v11.16b, v2.4b[2]
+ SDOT v31.4s, v11.16b, v3.4b[2]
+ B 4b
+
+ # Store odd width
+ .p2align 3
+7:
+ TBZ x1, 3, 8f
+ STR d7, [x7], 8
+ DUP d7, v7.d[1]
+ STR d6, [x17], 8
+ DUP d6, v6.d[1]
+ STR d5, [x16], 8
+ DUP d5, v5.d[1]
+ STR d4, [x6], 8
+ DUP d4, v4.d[1]
+8:
+ TBZ x1, 2, 9f
+ STR s7, [x7], 4
+ DUP s7, v7.s[1]
+ STR s6, [x17], 4
+ DUP s6, v6.s[1]
+ STR s5, [x16], 4
+ DUP s5, v5.s[1]
+ STR s4, [x6], 4
+ DUP s4, v4.s[1]
+9:
+ TBZ x1, 1, 10f
+ ST1 {v7.h}[0], [x7], 2
+ DUP h7, v7.h[1]
+ ST1 {v6.h}[0], [x17], 2
+ DUP h6, v6.h[1]
+ ST1 {v5.h}[0], [x16], 2
+ DUP h5, v5.h[1]
+ ST1 {v4.h}[0], [x6], 2
+ DUP h4, v4.h[1]
+10:
+ TBZ x1, 0, 11f
+ ST1 {v7.b}[0], [x7]
+ ST1 {v6.b}[0], [x17]
+ ST1 {v5.b}[0], [x16]
+ ST1 {v4.b}[0], [x6]
+11:
+ # Restore d8-d15 from stack
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 32
+ RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S b/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
new file mode 100644
index 0000000..351eef4
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
@@ -0,0 +1,440 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/4x16c4-aarch64-neondot-ld128.S.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const int8_t**restrict a, x4
+# const int8_t* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x10
+# size_t a_offset, [sp + 8] -> x11
+# const float* zero, [sp + 16] -> x12
+# const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0
+# A1 x14 v1
+# A2 x15 v2
+# A3 x8 v3
+# B x5 v4 v5 v6 v7
+# C0 x6 v16 v20 v24 v28
+# C1 x16 v17 v21 v25 v29
+# C2 x17 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v8 v9 v10 v11 v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
+
+ # Clamp C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x10, x11, [sp] // Load cn_stride, a_offset
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x16, x6, x16, LO // c1 = c0
+ ADD x2, x2, 3 // kc = (kc + 3) & ~3
+
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ LDR x12, [sp, 16] // Load zero pointer
+ // if mr <= 2
+ CSEL x17, x16, x17, LS // c2 = c1
+ BIC x2, x2, 3 // kc rounded up to a multiple of 4
+
+ CMP x0, 4 // if mr < 4
+ ADD x7, x17, x7 // c3 = c2 + cm_stride
+ CSEL x7, x17, x7, LO // c3 = c2
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ MOV x9, x3 // p = ks
+
+ .p2align 3
+1:
+ # Load next 4 A pointers
+ LDP x13, x14, [x4], 16
+ LDP x15, x8, [x4], 16
+
+ CMP x13, x12 // if a0 == zero
+ ADD x13, x13, x11 // a0 += a_offset
+ CSEL x13, x12, x13, EQ // a0 = zero, else a0 += a_offset
+ CMP x14, x12 // if a1 == zero
+ ADD x14, x14, x11 // a1 += a_offset
+ CSEL x14, x12, x14, EQ // a1 = zero, else a1 += a_offset
+ CMP x15, x12 // if a2 == zero
+ ADD x15, x15, x11 // a2 += a_offset
+ CSEL x15, x12, x15, EQ // a2 = zero, else a2 += a_offset
+ CMP x8, x12 // if a3 == zero
+ ADD x8, x8, x11 // a3 += a_offset
+ CSEL x8, x12, x8, EQ // a3 = zero, else a3 += a_offset
+
+ # Is there at least 16 bytes for main loop?
+ SUBS x0, x2, 16 // k = kc - 16
+ B.LO 4f
+
+ # Main loop - 16 bytes of A
+ .p2align 3
+2:
+ LDR q0, [x13], 16
+ LDR q4, [x5], 16
+ LDR q1, [x14], 16
+ LDR q2, [x15], 16
+ LDR q3, [x8], 16
+ LDR q5, [x5], 16
+ SDOT v16.4s, v4.16b, v0.4b[0]
+ SDOT v17.4s, v4.16b, v1.4b[0]
+ LDP q6, q7, [x5], 32
+ SDOT v18.4s, v4.16b, v2.4b[0]
+ SDOT v19.4s, v4.16b, v3.4b[0]
+ SDOT v20.4s, v5.16b, v0.4b[0]
+ SDOT v21.4s, v5.16b, v1.4b[0]
+ SDOT v22.4s, v5.16b, v2.4b[0]
+ SDOT v23.4s, v5.16b, v3.4b[0]
+ SDOT v24.4s, v6.16b, v0.4b[0]
+ SDOT v25.4s, v6.16b, v1.4b[0]
+ LDP q4, q5, [x5], 32
+ SDOT v26.4s, v6.16b, v2.4b[0]
+ SDOT v27.4s, v6.16b, v3.4b[0]
+ SDOT v28.4s, v7.16b, v0.4b[0]
+ SDOT v29.4s, v7.16b, v1.4b[0]
+ SDOT v30.4s, v7.16b, v2.4b[0]
+ SDOT v31.4s, v7.16b, v3.4b[0]
+
+ SDOT v16.4s, v4.16b, v0.4b[1]
+ SDOT v17.4s, v4.16b, v1.4b[1]
+ LDP q6, q7, [x5], 32
+ SDOT v18.4s, v4.16b, v2.4b[1]
+ SDOT v19.4s, v4.16b, v3.4b[1]
+ SDOT v20.4s, v5.16b, v0.4b[1]
+ SDOT v21.4s, v5.16b, v1.4b[1]
+ SDOT v22.4s, v5.16b, v2.4b[1]
+ SDOT v23.4s, v5.16b, v3.4b[1]
+ SDOT v24.4s, v6.16b, v0.4b[1]
+ SDOT v25.4s, v6.16b, v1.4b[1]
+ LDP q4, q5, [x5], 32
+ SDOT v26.4s, v6.16b, v2.4b[1]
+ SDOT v27.4s, v6.16b, v3.4b[1]
+ SDOT v28.4s, v7.16b, v0.4b[1]
+ SDOT v29.4s, v7.16b, v1.4b[1]
+ SDOT v30.4s, v7.16b, v2.4b[1]
+ SDOT v31.4s, v7.16b, v3.4b[1]
+
+ SDOT v16.4s, v4.16b, v0.4b[2]
+ SDOT v17.4s, v4.16b, v1.4b[2]
+ LDP q6, q7, [x5], 32
+ SDOT v18.4s, v4.16b, v2.4b[2]
+ SDOT v19.4s, v4.16b, v3.4b[2]
+ SDOT v20.4s, v5.16b, v0.4b[2]
+ SDOT v21.4s, v5.16b, v1.4b[2]
+ SDOT v22.4s, v5.16b, v2.4b[2]
+ SDOT v23.4s, v5.16b, v3.4b[2]
+ SDOT v24.4s, v6.16b, v0.4b[2]
+ SDOT v25.4s, v6.16b, v1.4b[2]
+ LDP q4, q5, [x5], 32
+ SDOT v26.4s, v6.16b, v2.4b[2]
+ SDOT v27.4s, v6.16b, v3.4b[2]
+ SDOT v28.4s, v7.16b, v0.4b[2]
+ SDOT v29.4s, v7.16b, v1.4b[2]
+ SDOT v30.4s, v7.16b, v2.4b[2]
+ SDOT v31.4s, v7.16b, v3.4b[2]
+
+ SDOT v16.4s, v4.16b, v0.4b[3]
+ SDOT v17.4s, v4.16b, v1.4b[3]
+ LDP q6, q7, [x5], 32
+ SDOT v18.4s, v4.16b, v2.4b[3]
+ SDOT v19.4s, v4.16b, v3.4b[3]
+ SDOT v20.4s, v5.16b, v0.4b[3]
+ SDOT v21.4s, v5.16b, v1.4b[3]
+ SDOT v22.4s, v5.16b, v2.4b[3]
+ SDOT v23.4s, v5.16b, v3.4b[3]
+ SDOT v24.4s, v6.16b, v0.4b[3]
+ SDOT v25.4s, v6.16b, v1.4b[3]
+ SDOT v26.4s, v6.16b, v2.4b[3]
+ SDOT v27.4s, v6.16b, v3.4b[3]
+ SUBS x0, x0, 16
+ SDOT v28.4s, v7.16b, v0.4b[3]
+ SDOT v29.4s, v7.16b, v1.4b[3]
+ SDOT v30.4s, v7.16b, v2.4b[3]
+ SDOT v31.4s, v7.16b, v3.4b[3]
+ B.HS 2b
+
+ # Is there a remainder? - 4 to 12 bytes of A
+ TST x0, 15
+ B.NE 4f
+
+3:
+ # ks loop
+ SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
+ B.HI 1b
+
+ LDR x8, [sp, 24] // reload params pointer
+
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x8], 4 // load fp32 scale
+
+ SCVTF v16.4s, v16.4s // convert int32 accumulators to float
+ SCVTF v17.4s, v17.4s
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v0.4s, v16.4s // multiply by fp32 scale
+ FMUL v17.4s, v0.4s, v17.4s
+ FMUL v18.4s, v0.4s, v18.4s
+ FMUL v19.4s, v0.4s, v19.4s
+ FMUL v20.4s, v0.4s, v20.4s
+ FMUL v21.4s, v0.4s, v21.4s
+ FMUL v22.4s, v0.4s, v22.4s
+ FMUL v23.4s, v0.4s, v23.4s
+ FMUL v24.4s, v0.4s, v24.4s
+ FMUL v25.4s, v0.4s, v25.4s
+ FMUL v26.4s, v0.4s, v26.4s
+ FMUL v27.4s, v0.4s, v27.4s
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v16.4s, v16.4s // round to nearest, ties to even
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v2.8h}, [x8], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v2.8h
+ SQADD v17.8h, v17.8h, v2.8h
+ SQADD v18.8h, v18.8h, v2.8h
+ SQADD v19.8h, v19.8h, v2.8h
+ SQADD v24.8h, v24.8h, v2.8h
+ SQADD v25.8h, v25.8h, v2.8h
+ SQADD v26.8h, v26.8h, v2.8h
+ SQADD v27.8h, v27.8h, v2.8h
+ LD1R {v0.16b}, [x8], 1 // clamp min value
+
+ SQXTN v4.8b, v16.8h
+ SQXTN v5.8b, v17.8h
+ SQXTN v6.8b, v18.8h
+ SQXTN v7.8b, v19.8h
+ LD1R {v1.16b}, [x8] // clamp max value
+ SQXTN2 v4.16b, v24.8h
+ SQXTN2 v5.16b, v25.8h
+ SQXTN2 v6.16b, v26.8h
+ SQXTN2 v7.16b, v27.8h
+
+ SMAX v4.16b, v4.16b, v0.16b
+ SMAX v5.16b, v5.16b, v0.16b
+ SMAX v6.16b, v6.16b, v0.16b
+ SMAX v7.16b, v7.16b, v0.16b
+ SUBS x1, x1, 16
+ SMIN v4.16b, v4.16b, v1.16b
+ SMIN v5.16b, v5.16b, v1.16b
+ SMIN v6.16b, v6.16b, v1.16b
+ SMIN v7.16b, v7.16b, v1.16b
+ B.LO 6f
+
+ # Store full 4 x 16
+ ST1 {v7.16b}, [x7], x10
+ ST1 {v6.16b}, [x17], x10
+ ST1 {v5.16b}, [x16], x10
+ ST1 {v4.16b}, [x6], x10
+
+ SUB x4, x4, x3 // a -= ks
+
+ # nc loop
+ B.HI 0b
+ RET
+
+ # Remainder - 8 bytes of A
+ .p2align 3
+4:
+ # Is there a remainder? - 8 bytes of A
+ TBZ x0, 3, 5f
+
+ LDR d0, [x13], 8
+ LDR q4, [x5], 16
+ LDR d1, [x14], 8
+ LDR d2, [x15], 8
+ LDR d3, [x8], 8
+ LDR q5, [x5], 16
+ SDOT v16.4s, v4.16b, v0.4b[0]
+ SDOT v17.4s, v4.16b, v1.4b[0]
+ LDP q6, q7, [x5], 32
+ SDOT v18.4s, v4.16b, v2.4b[0]
+ SDOT v19.4s, v4.16b, v3.4b[0]
+ SDOT v20.4s, v5.16b, v0.4b[0]
+ SDOT v21.4s, v5.16b, v1.4b[0]
+ SDOT v22.4s, v5.16b, v2.4b[0]
+ SDOT v23.4s, v5.16b, v3.4b[0]
+ SDOT v24.4s, v6.16b, v0.4b[0]
+ SDOT v25.4s, v6.16b, v1.4b[0]
+ LDP q4, q5, [x5], 32
+ SDOT v26.4s, v6.16b, v2.4b[0]
+ SDOT v27.4s, v6.16b, v3.4b[0]
+ SDOT v28.4s, v7.16b, v0.4b[0]
+ SDOT v29.4s, v7.16b, v1.4b[0]
+ SDOT v30.4s, v7.16b, v2.4b[0]
+ SDOT v31.4s, v7.16b, v3.4b[0]
+ SDOT v16.4s, v4.16b, v0.4b[1]
+ SDOT v17.4s, v4.16b, v1.4b[1]
+ LDP q6, q7, [x5], 32
+ SDOT v18.4s, v4.16b, v2.4b[1]
+ SDOT v19.4s, v4.16b, v3.4b[1]
+ SDOT v20.4s, v5.16b, v0.4b[1]
+ SDOT v21.4s, v5.16b, v1.4b[1]
+ SDOT v22.4s, v5.16b, v2.4b[1]
+ SDOT v23.4s, v5.16b, v3.4b[1]
+ SDOT v24.4s, v6.16b, v0.4b[1]
+ SDOT v25.4s, v6.16b, v1.4b[1]
+ SDOT v26.4s, v6.16b, v2.4b[1]
+ SDOT v27.4s, v6.16b, v3.4b[1]
+ SDOT v28.4s, v7.16b, v0.4b[1]
+ SDOT v29.4s, v7.16b, v1.4b[1]
+ SDOT v30.4s, v7.16b, v2.4b[1]
+ SDOT v31.4s, v7.16b, v3.4b[1]
+
+ # Remainder - 4 bytes of A
+5:
+ # Is there a remainder? - 4 bytes of A
+ TBZ x0, 2, 3b
+
+ LDR s0, [x13], 4
+ LDR q4, [x5], 16
+ LDR s1, [x14], 4
+ LDR s2, [x15], 4
+ LDR s3, [x8], 4
+ LDR q5, [x5], 16
+ SDOT v16.4s, v4.16b, v0.4b[0]
+ SDOT v17.4s, v4.16b, v1.4b[0]
+ LDP q6, q7, [x5], 32
+ SDOT v18.4s, v4.16b, v2.4b[0]
+ SDOT v19.4s, v4.16b, v3.4b[0]
+ SDOT v20.4s, v5.16b, v0.4b[0]
+ SDOT v21.4s, v5.16b, v1.4b[0]
+ SDOT v22.4s, v5.16b, v2.4b[0]
+ SDOT v23.4s, v5.16b, v3.4b[0]
+ LDR x8, [sp, 24] // reload params pointer
+ SDOT v24.4s, v6.16b, v0.4b[0]
+ SDOT v25.4s, v6.16b, v1.4b[0]
+ SDOT v26.4s, v6.16b, v2.4b[0]
+ SDOT v27.4s, v6.16b, v3.4b[0]
+ SDOT v28.4s, v7.16b, v0.4b[0]
+ SDOT v29.4s, v7.16b, v1.4b[0]
+ SDOT v30.4s, v7.16b, v2.4b[0]
+ SDOT v31.4s, v7.16b, v3.4b[0]
+ B 3b
+
+ # Store odd width
+ .p2align 3
+6:
+ TBZ x1, 3, 7f
+ STR d7, [x7], 8
+ DUP d7, v7.d[1]
+ STR d6, [x17], 8
+ DUP d6, v6.d[1]
+ STR d5, [x16], 8
+ DUP d5, v5.d[1]
+ STR d4, [x6], 8
+ DUP d4, v4.d[1]
+7:
+ TBZ x1, 2, 8f
+ STR s7, [x7], 4
+ DUP s7, v7.s[1]
+ STR s6, [x17], 4
+ DUP s6, v6.s[1]
+ STR s5, [x16], 4
+ DUP s5, v5.s[1]
+ STR s4, [x6], 4
+ DUP s4, v4.s[1]
+8:
+ TBZ x1, 1, 9f
+ ST1 {v7.h}[0], [x7], 2
+ DUP h7, v7.h[1]
+ ST1 {v6.h}[0], [x17], 2
+ DUP h6, v6.h[1]
+ ST1 {v5.h}[0], [x16], 2
+ DUP h5, v5.h[1]
+ ST1 {v4.h}[0], [x6], 2
+ DUP h4, v4.h[1]
+9:
+ TBZ x1, 0, 10f
+ ST1 {v7.b}[0], [x7]
+ ST1 {v6.b}[0], [x17]
+ ST1 {v5.b}[0], [x16]
+ ST1 {v4.b}[0], [x6]
+10:
+ RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S b/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
new file mode 100644
index 0000000..f6cf7f5
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld64.S
@@ -0,0 +1,353 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/4x16c4-aarch64-neondot-ld64.S.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <xnnpack/assembly.h>
+
+
+# void xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const int8_t**restrict a, x4
+# const int8_t* restrict w, x5
+# int8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x10
+# size_t a_offset, [sp + 8] -> x11
+# const float* zero, [sp + 16] -> x12
+# const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0
+# A1 x14 v1
+# A2 x15 v2
+# A3 x8 v3
+# B x5 v4 v5 v6 v7
+# C0 x6 v16 v20 v24 v28
+# C1 x16 v17 v21 v25 v29
+# C2 x17 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# unused v8 v9 v10 v11 v12 v13 v14 v15
+
+BEGIN_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
+
+ # Clamp C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x10, x11, [sp] // Load cn_stride, a_offset
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x16, x6, x16, LO // c1 = c0
+ ADD x2, x2, 3 // kc = (kc + 3) & ~3
+
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ LDR x12, [sp, 16] // Load zero pointer
+ // if mr <= 2
+ CSEL x17, x16, x17, LS // c2 = c1
+ BIC x2, x2, 3 // kc rounded up to a multiple of 4
+
+ CMP x0, 4 // if mr < 4
+ ADD x7, x17, x7 // c3 = c2 + cm_stride
+ CSEL x7, x17, x7, LO // c3 = c2
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+ MOV x9, x3 // p = ks
+
+ .p2align 3
+1:
+ # Load next 4 A pointers
+ LDP x13, x14, [x4], 16
+ LDP x15, x8, [x4], 16
+
+ CMP x13, x12 // if a0 == zero
+ ADD x13, x13, x11 // a0 += a_offset
+ CSEL x13, x12, x13, EQ // a0 = zero, else a0 += a_offset
+ CMP x14, x12 // if a1 == zero
+ ADD x14, x14, x11 // a1 += a_offset
+ CSEL x14, x12, x14, EQ // a1 = zero, else a1 += a_offset
+ CMP x15, x12 // if a2 == zero
+ ADD x15, x15, x11 // a2 += a_offset
+ CSEL x15, x12, x15, EQ // a2 = zero, else a2 += a_offset
+ CMP x8, x12 // if a3 == zero
+ ADD x8, x8, x11 // a3 += a_offset
+ CSEL x8, x12, x8, EQ // a3 = zero, else a3 += a_offset
+
+ # Is there at least 8 bytes for main loop?
+ SUBS x0, x2, 8 // k = kc - 8
+ B.LO 4f
+
+ # Main loop - 8 bytes of A
+ .p2align 3
+2:
+ LDR d0, [x13], 8
+ LDR q4, [x5], 16
+ LDR d1, [x14], 8
+ LDR d2, [x15], 8
+ LDR d3, [x8], 8
+ LDR q5, [x5], 16
+ SDOT v16.4s, v4.16b, v0.4b[0]
+ SDOT v17.4s, v4.16b, v1.4b[0]
+ LDP q6, q7, [x5], 32
+ SDOT v18.4s, v4.16b, v2.4b[0]
+ SDOT v19.4s, v4.16b, v3.4b[0]
+ SDOT v20.4s, v5.16b, v0.4b[0]
+ SDOT v21.4s, v5.16b, v1.4b[0]
+ SDOT v22.4s, v5.16b, v2.4b[0]
+ SDOT v23.4s, v5.16b, v3.4b[0]
+ SDOT v24.4s, v6.16b, v0.4b[0]
+ SDOT v25.4s, v6.16b, v1.4b[0]
+ LDP q4, q5, [x5], 32
+ SDOT v26.4s, v6.16b, v2.4b[0]
+ SDOT v27.4s, v6.16b, v3.4b[0]
+ SDOT v28.4s, v7.16b, v0.4b[0]
+ SDOT v29.4s, v7.16b, v1.4b[0]
+ SDOT v30.4s, v7.16b, v2.4b[0]
+ SDOT v31.4s, v7.16b, v3.4b[0]
+ SDOT v16.4s, v4.16b, v0.4b[1]
+ SDOT v17.4s, v4.16b, v1.4b[1]
+ LDP q6, q7, [x5], 32
+ SDOT v18.4s, v4.16b, v2.4b[1]
+ SDOT v19.4s, v4.16b, v3.4b[1]
+ SDOT v20.4s, v5.16b, v0.4b[1]
+ SDOT v21.4s, v5.16b, v1.4b[1]
+ SDOT v22.4s, v5.16b, v2.4b[1]
+ SDOT v23.4s, v5.16b, v3.4b[1]
+ SDOT v24.4s, v6.16b, v0.4b[1]
+ SDOT v25.4s, v6.16b, v1.4b[1]
+ SDOT v26.4s, v6.16b, v2.4b[1]
+ SDOT v27.4s, v6.16b, v3.4b[1]
+ SDOT v28.4s, v7.16b, v0.4b[1]
+ SDOT v29.4s, v7.16b, v1.4b[1]
+ SDOT v30.4s, v7.16b, v2.4b[1]
+ SUBS x0, x0, 8
+ SDOT v31.4s, v7.16b, v3.4b[1]
+ B.HS 2b
+
+ # Is there a remainder? - 4 bytes of A
+ TBNZ x0, 2, 4f
+
+ LDR x8, [sp, 24] // reload params pointer
+
+ # ks loop
+ SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
+ B.HI 1b
+
+3:
+ # Apply params - scale, bias and clamp
+ LD1R {v0.4s}, [x8], 4 // load fp32 scale
+
+ SCVTF v16.4s, v16.4s // convert int32 accumulators to float
+ SCVTF v17.4s, v17.4s
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v0.4s, v16.4s // multiply by fp32 scale
+ FMUL v17.4s, v0.4s, v17.4s
+ FMUL v18.4s, v0.4s, v18.4s
+ FMUL v19.4s, v0.4s, v19.4s
+ FMUL v20.4s, v0.4s, v20.4s
+ FMUL v21.4s, v0.4s, v21.4s
+ FMUL v22.4s, v0.4s, v22.4s
+ FMUL v23.4s, v0.4s, v23.4s
+ FMUL v24.4s, v0.4s, v24.4s
+ FMUL v25.4s, v0.4s, v25.4s
+ FMUL v26.4s, v0.4s, v26.4s
+ FMUL v27.4s, v0.4s, v27.4s
+ FMUL v28.4s, v0.4s, v28.4s
+ FMUL v29.4s, v0.4s, v29.4s
+ FMUL v30.4s, v0.4s, v30.4s
+ FMUL v31.4s, v0.4s, v31.4s
+
+ FCVTNS v16.4s, v16.4s // round to nearest, ties to even
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v2.8h}, [x8], 2 // add bias
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v2.8h
+ SQADD v17.8h, v17.8h, v2.8h
+ SQADD v18.8h, v18.8h, v2.8h
+ SQADD v19.8h, v19.8h, v2.8h
+ SQADD v24.8h, v24.8h, v2.8h
+ SQADD v25.8h, v25.8h, v2.8h
+ SQADD v26.8h, v26.8h, v2.8h
+ SQADD v27.8h, v27.8h, v2.8h
+ LD1R {v0.16b}, [x8], 1 // clamp min value
+
+ SQXTN v4.8b, v16.8h
+ SQXTN v5.8b, v17.8h
+ SQXTN v6.8b, v18.8h
+ SQXTN v7.8b, v19.8h
+ LD1R {v1.16b}, [x8] // clamp max value
+ SQXTN2 v4.16b, v24.8h
+ SQXTN2 v5.16b, v25.8h
+ SQXTN2 v6.16b, v26.8h
+ SQXTN2 v7.16b, v27.8h
+
+ SMAX v4.16b, v4.16b, v0.16b
+ SMAX v5.16b, v5.16b, v0.16b
+ SMAX v6.16b, v6.16b, v0.16b
+ SMAX v7.16b, v7.16b, v0.16b
+ SUBS x1, x1, 16
+ SMIN v4.16b, v4.16b, v1.16b
+ SMIN v5.16b, v5.16b, v1.16b
+ SMIN v6.16b, v6.16b, v1.16b
+ SMIN v7.16b, v7.16b, v1.16b
+ B.LO 5f
+
+ # Store full 4 x 16
+ ST1 {v7.16b}, [x7], x10
+ ST1 {v6.16b}, [x17], x10
+ ST1 {v5.16b}, [x16], x10
+ ST1 {v4.16b}, [x6], x10
+
+ SUB x4, x4, x3 // a -= ks
+
+ # nc loop
+ B.HI 0b
+ RET
+
+ # Remainder - 4 bytes of A
+ .p2align 3
+4:
+ LDR s0, [x13], 4
+ LDR q4, [x5], 16
+ LDR s1, [x14], 4
+ LDR s2, [x15], 4
+ LDR s3, [x8], 4
+ LDR q5, [x5], 16
+ SDOT v16.4s, v4.16b, v0.4b[0]
+ SDOT v17.4s, v4.16b, v1.4b[0]
+ LDP q6, q7, [x5], 32
+ SDOT v18.4s, v4.16b, v2.4b[0]
+ SDOT v19.4s, v4.16b, v3.4b[0]
+ SDOT v20.4s, v5.16b, v0.4b[0]
+ SDOT v21.4s, v5.16b, v1.4b[0]
+ SDOT v22.4s, v5.16b, v2.4b[0]
+ SDOT v23.4s, v5.16b, v3.4b[0]
+ LDR x8, [sp, 24] // reload params pointer
+ SDOT v24.4s, v6.16b, v0.4b[0]
+ SDOT v25.4s, v6.16b, v1.4b[0]
+ SDOT v26.4s, v6.16b, v2.4b[0]
+ SDOT v27.4s, v6.16b, v3.4b[0]
+ SDOT v28.4s, v7.16b, v0.4b[0]
+ SDOT v29.4s, v7.16b, v1.4b[0]
+ SDOT v30.4s, v7.16b, v2.4b[0]
+ SDOT v31.4s, v7.16b, v3.4b[0]
+
+ # ks loop
+ SUBS x9, x9, 32 // ks -= MR * sizeof(int8_t*)
+ B.HI 1b
+ B 3b
+
+ # Store odd width
+ .p2align 3
+5:
+ TBZ x1, 3, 6f
+ STR d7, [x7], 8
+ DUP d7, v7.d[1]
+ STR d6, [x17], 8
+ DUP d6, v6.d[1]
+ STR d5, [x16], 8
+ DUP d5, v5.d[1]
+ STR d4, [x6], 8
+ DUP d4, v4.d[1]
+6:
+ TBZ x1, 2, 7f
+ STR s7, [x7], 4
+ DUP s7, v7.s[1]
+ STR s6, [x17], 4
+ DUP s6, v6.s[1]
+ STR s5, [x16], 4
+ DUP s5, v5.s[1]
+ STR s4, [x6], 4
+ DUP s4, v4.s[1]
+7:
+ TBZ x1, 1, 8f
+ ST1 {v7.h}[0], [x7], 2
+ DUP h7, v7.h[1]
+ ST1 {v6.h}[0], [x17], 2
+ DUP h6, v6.h[1]
+ ST1 {v5.h}[0], [x16], 2
+ DUP h5, v5.h[1]
+ ST1 {v4.h}[0], [x6], 2
+ DUP h4, v4.h[1]
+8:
+ TBZ x1, 0, 9f
+ ST1 {v7.b}[0], [x7]
+ ST1 {v6.b}[0], [x17]
+ ST1 {v5.b}[0], [x16]
+ ST1 {v4.b}[0], [x6]
+9:
+ RET
+
+END_FUNCTION xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S b/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
index c7cbfee..087dfbe 100644
--- a/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
+++ b/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-cortex-a55.S
@@ -9,6 +9,7 @@
#include <xnnpack/assembly.h>
+
# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55(
# size_t mr, x0
# size_t nc, x1
@@ -21,7 +22,7 @@
# size_t cn_stride, [sp] -> (x0)
# size_t a_offset, [sp + 8] -> x11
# const float* zero, [sp + 16] -> x12
-# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x8)
+# const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -433,73 +434,70 @@
B.HI 1b
# Apply params - scale, shift, bias and clamp
- LD1R {v0.4s}, [x8], 4
- SQRDMULH v4.4s, v16.4s, v0.4s
- SQRDMULH v5.4s, v17.4s, v0.4s
- LD1R {v1.4s}, [x8], 4
- SQRDMULH v6.4s, v18.4s, v0.4s
- SQRDMULH v7.4s, v19.4s, v0.4s
- SQRDMULH v8.4s, v20.4s, v0.4s
- SQRDMULH v9.4s, v21.4s, v0.4s
+ LD2R {v0.4s, v1.4s}, [x8], 8
CMEQ v2.4s, v1.4s, 0
- SQRDMULH v10.4s, v22.4s, v0.4s
- SQRDMULH v11.4s, v23.4s, v0.4s
- BIC v16.16b, v16.16b, v2.16b
- BIC v17.16b, v17.16b, v2.16b
- BIC v18.16b, v18.16b, v2.16b
- BIC v19.16b, v19.16b, v2.16b
- BIC v20.16b, v20.16b, v2.16b
- BIC v21.16b, v21.16b, v2.16b
- BIC v22.16b, v22.16b, v2.16b
- BIC v23.16b, v23.16b, v2.16b
+ BIC v4.16b, v16.16b, v2.16b
+ BIC v5.16b, v17.16b, v2.16b
+ BIC v6.16b, v18.16b, v2.16b
+ BIC v7.16b, v19.16b, v2.16b
- SSRA v4.4s, v16.4s, 31 // signed shift right accumulate
- SSRA v5.4s, v17.4s, 31
- SSRA v6.4s, v18.4s, 31
- SSRA v7.4s, v19.4s, 31
- SSRA v8.4s, v20.4s, 31
- SSRA v9.4s, v21.4s, 31
- SSRA v10.4s, v22.4s, 31
- SSRA v11.4s, v23.4s, 31
+ SQRDMULH v16.4s, v16.4s, v0.4s
+ SQRDMULH v17.4s, v17.4s, v0.4s
+ SQRDMULH v18.4s, v18.4s, v0.4s
+ SQRDMULH v19.4s, v19.4s, v0.4s
- SQRDMULH v16.4s, v24.4s, v0.4s
- SQRDMULH v17.4s, v25.4s, v0.4s
- SQRDMULH v18.4s, v26.4s, v0.4s
- SQRDMULH v19.4s, v27.4s, v0.4s
- SQRDMULH v20.4s, v28.4s, v0.4s
- SQRDMULH v21.4s, v29.4s, v0.4s
- SQRDMULH v22.4s, v30.4s, v0.4s
- SQRDMULH v23.4s, v31.4s, v0.4s
+ SSRA v16.4s, v4.4s, 31 // signed shift right accumulate
+ SSRA v17.4s, v5.4s, 31
+ SSRA v18.4s, v6.4s, 31
+ SSRA v19.4s, v7.4s, 31
- BIC v24.16b, v24.16b, v2.16b
- BIC v25.16b, v25.16b, v2.16b
- BIC v26.16b, v26.16b, v2.16b
- BIC v27.16b, v27.16b, v2.16b
- BIC v28.16b, v28.16b, v2.16b
- BIC v29.16b, v29.16b, v2.16b
- BIC v30.16b, v30.16b, v2.16b
- BIC v31.16b, v31.16b, v2.16b
+ BIC v4.16b, v20.16b, v2.16b
+ BIC v5.16b, v21.16b, v2.16b
+ BIC v6.16b, v22.16b, v2.16b
+ BIC v7.16b, v23.16b, v2.16b
- SSRA v16.4s, v24.4s, 31
- SSRA v17.4s, v25.4s, 31
- SSRA v18.4s, v26.4s, 31
- SSRA v19.4s, v27.4s, 31
- SSRA v20.4s, v28.4s, 31
- SSRA v21.4s, v29.4s, 31
- SSRA v22.4s, v30.4s, 31
- SSRA v23.4s, v31.4s, 31
+ SQRDMULH v20.4s, v20.4s, v0.4s
+ SQRDMULH v21.4s, v21.4s, v0.4s
+ SQRDMULH v22.4s, v22.4s, v0.4s
+ SQRDMULH v23.4s, v23.4s, v0.4s
- SRSHL v4.4s, v4.4s, v1.4s // signed rounding shift left
- SRSHL v5.4s, v5.4s, v1.4s
- SRSHL v6.4s, v6.4s, v1.4s
- SRSHL v7.4s, v7.4s, v1.4s
- SRSHL v8.4s, v8.4s, v1.4s
- SRSHL v9.4s, v9.4s, v1.4s
- SRSHL v10.4s, v10.4s, v1.4s
- SRSHL v11.4s, v11.4s, v1.4s
+ SSRA v20.4s, v4.4s, 31
+ SSRA v21.4s, v5.4s, 31
+ SSRA v22.4s, v6.4s, 31
+ SSRA v23.4s, v7.4s, 31
- SRSHL v16.4s, v16.4s, v1.4s
+ BIC v4.16b, v24.16b, v2.16b
+ BIC v5.16b, v25.16b, v2.16b
+ BIC v6.16b, v26.16b, v2.16b
+ BIC v7.16b, v27.16b, v2.16b
+
+ SQRDMULH v24.4s, v24.4s, v0.4s
+ SQRDMULH v25.4s, v25.4s, v0.4s
+ SQRDMULH v26.4s, v26.4s, v0.4s
+ SQRDMULH v27.4s, v27.4s, v0.4s
+
+ SSRA v24.4s, v4.4s, 31
+ SSRA v25.4s, v5.4s, 31
+ SSRA v26.4s, v6.4s, 31
+ SSRA v27.4s, v7.4s, 31
+
+ BIC v4.16b, v28.16b, v2.16b
+ BIC v5.16b, v29.16b, v2.16b
+ BIC v6.16b, v30.16b, v2.16b
+ BIC v7.16b, v31.16b, v2.16b
+
+ SQRDMULH v28.4s, v28.4s, v0.4s
+ SQRDMULH v29.4s, v29.4s, v0.4s
+ SQRDMULH v30.4s, v30.4s, v0.4s
+ SQRDMULH v31.4s, v31.4s, v0.4s
+
+ SSRA v28.4s, v4.4s, 31
+ SSRA v29.4s, v5.4s, 31
+ SSRA v30.4s, v6.4s, 31
+ SSRA v31.4s, v7.4s, 31
+
+ SRSHL v16.4s, v16.4s, v1.4s // signed rounding shift left
SRSHL v17.4s, v17.4s, v1.4s
SRSHL v18.4s, v18.4s, v1.4s
SRSHL v19.4s, v19.4s, v1.4s
@@ -507,57 +505,64 @@
SRSHL v21.4s, v21.4s, v1.4s
SRSHL v22.4s, v22.4s, v1.4s
SRSHL v23.4s, v23.4s, v1.4s
+ SRSHL v24.4s, v24.4s, v1.4s
+ SRSHL v25.4s, v25.4s, v1.4s
+ SRSHL v26.4s, v26.4s, v1.4s
+ SRSHL v27.4s, v27.4s, v1.4s
+ SRSHL v28.4s, v28.4s, v1.4s
+ SRSHL v29.4s, v29.4s, v1.4s
+ SRSHL v30.4s, v30.4s, v1.4s
+ SRSHL v31.4s, v31.4s, v1.4s
- SQXTN v4.4h, v4.4s
- SQXTN v5.4h, v5.4s
- SQXTN v6.4h, v6.4s
- SQXTN v7.4h, v7.4s
SQXTN v16.4h, v16.4s
SQXTN v17.4h, v17.4s
SQXTN v18.4h, v18.4s
SQXTN v19.4h, v19.4s
- LD1R {v2.8h}, [x8], 2 // add bias
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v2.8h}, [x8], 2 // add bias
- SQXTN2 v4.8h, v8.4s
- SQXTN2 v5.8h, v9.4s
- SQXTN2 v6.8h, v10.4s
- SQXTN2 v7.8h, v11.4s
SQXTN2 v16.8h, v20.4s
SQXTN2 v17.8h, v21.4s
SQXTN2 v18.8h, v22.4s
SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
- SQADD v4.8h, v4.8h, v2.8h
- SQADD v5.8h, v5.8h, v2.8h
- SQADD v6.8h, v6.8h, v2.8h
- SQADD v7.8h, v7.8h, v2.8h
SQADD v16.8h, v16.8h, v2.8h
SQADD v17.8h, v17.8h, v2.8h
SQADD v18.8h, v18.8h, v2.8h
SQADD v19.8h, v19.8h, v2.8h
- LD1R {v0.16b}, [x8], 1 // clamp min value
+ SQADD v24.8h, v24.8h, v2.8h
+ SQADD v25.8h, v25.8h, v2.8h
+ SQADD v26.8h, v26.8h, v2.8h
+ SQADD v27.8h, v27.8h, v2.8h
+ LD1R {v0.16b}, [x8], 1 // clamp min value
- SQXTN v4.8b, v4.8h
- SQXTN v5.8b, v5.8h
- SQXTN v6.8b, v6.8h
- SQXTN v7.8b, v7.8h
- LD1R {v1.16b}, [x8] // clamp max value
- SQXTN2 v4.16b, v16.8h
- SQXTN2 v5.16b, v17.8h
- SQXTN2 v6.16b, v18.8h
- SQXTN2 v7.16b, v19.8h
+ SQXTN v4.8b, v16.8h
+ SQXTN v5.8b, v17.8h
+ SQXTN v6.8b, v18.8h
+ SQXTN v7.8b, v19.8h
+ LD1R {v1.16b}, [x8] // clamp max value
+ SQXTN2 v4.16b, v24.8h
+ SQXTN2 v5.16b, v25.8h
+ SQXTN2 v6.16b, v26.8h
+ SQXTN2 v7.16b, v27.8h
SUB x8, x8, 11 // rewind params pointer
-
- SMAX v4.16b, v4.16b, v0.16b
- SMAX v5.16b, v5.16b, v0.16b
- LDR x0, [sp, 32] // Load cn_stride
- SMAX v6.16b, v6.16b, v0.16b
- SMAX v7.16b, v7.16b, v0.16b
+ SMAX v4.16b, v4.16b, v0.16b
+ SMAX v5.16b, v5.16b, v0.16b
+ LDR x0, [sp, 32] // cn_stride
+ SMAX v6.16b, v6.16b, v0.16b
+ SMAX v7.16b, v7.16b, v0.16b
SUBS x1, x1, 16
- SMIN v4.16b, v4.16b, v1.16b
- SMIN v5.16b, v5.16b, v1.16b
- SMIN v6.16b, v6.16b, v1.16b
- SMIN v7.16b, v7.16b, v1.16b
+ SMIN v4.16b, v4.16b, v1.16b
+ SMIN v5.16b, v5.16b, v1.16b
+ SMIN v6.16b, v6.16b, v1.16b
+ SMIN v7.16b, v7.16b, v1.16b
B.LO 7f
# Store full 4 x 16
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S b/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
index d41ebba..99d43ee 100644
--- a/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
+++ b/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld128.S
@@ -9,6 +9,7 @@
#include <xnnpack/assembly.h>
+
# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128(
# size_t mr, x0
# size_t nc, x1
@@ -21,7 +22,7 @@
# size_t cn_stride, [sp] -> x10
# size_t a_offset, [sp + 8] -> x11
# const float* zero, [sp + 16] -> x12
-# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x8)
+# const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
@@ -276,6 +277,7 @@
SRSHL v30.4s, v30.4s, v1.4s
SRSHL v31.4s, v31.4s, v1.4s
+
SQXTN v16.4h, v16.4s
SQXTN v17.4h, v17.4s
SQXTN v18.4h, v18.4s
@@ -284,7 +286,7 @@
SQXTN v25.4h, v25.4s
SQXTN v26.4h, v26.4s
SQXTN v27.4h, v27.4s
- LD1R {v2.8h}, [x8], 2 // add bias
+ LD1R {v2.8h}, [x8], 2 // add bias
SQXTN2 v16.8h, v20.4s
SQXTN2 v17.8h, v21.4s
@@ -303,13 +305,13 @@
SQADD v25.8h, v25.8h, v2.8h
SQADD v26.8h, v26.8h, v2.8h
SQADD v27.8h, v27.8h, v2.8h
- LD1R {v0.16b}, [x8], 1 // clamp min value
+ LD1R {v0.16b}, [x8], 1 // clamp min value
SQXTN v4.8b, v16.8h
SQXTN v5.8b, v17.8h
SQXTN v6.8b, v18.8h
SQXTN v7.8b, v19.8h
- LD1R {v1.16b}, [x8] // clamp max value
+ LD1R {v1.16b}, [x8] // clamp max value
SQXTN2 v4.16b, v24.8h
SQXTN2 v5.16b, v25.8h
SQXTN2 v6.16b, v26.8h
diff --git a/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S b/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
index 48b995e..a188893 100644
--- a/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
+++ b/src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-aarch64-neondot-ld64.S
@@ -9,6 +9,7 @@
#include <xnnpack/assembly.h>
+
# void xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64(
# size_t mr, x0
# size_t nc, x1
@@ -21,7 +22,7 @@
# size_t cn_stride, [sp] -> x10
# size_t a_offset, [sp + 8] -> x11
# const float* zero, [sp + 16] -> x12
-# const xnn_qs8_conv_minmax_params params [sp + 24] -> (x8)
+# const union xnn_qs8_minmax_params params [sp + 24] -> (x8)
# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index d13693b..aa9062c 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -648,8 +648,6 @@
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32)
@@ -657,7 +655,12 @@
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld64)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld64)
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 4d17ea9..6cc5ceb 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -462,6 +462,10 @@
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55)
+
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld64)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x4c2__sse2_ld64)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x4c2__sse2_ld64)
diff --git a/test/qs8-gemm-minmax-fp32.cc b/test/qs8-gemm-minmax-fp32.cc
index 553a0e0..36a154b 100644
--- a/test/qs8-gemm-minmax-fp32.cc
+++ b/test/qs8-gemm-minmax-fp32.cc
@@ -23,6 +23,1830 @@
#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(4)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(4)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(4)
+ .a_stride(7)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(7)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 4; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 5; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 8; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(4)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(4)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(4)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_1X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(4)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(4)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(4)
+ .a_stride(7)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(7)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 4; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 5; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 8; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+        .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+        .n(n)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(4)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(4)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(4)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+        .n(n)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+        .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+        .n(n)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
TEST_REQUIRES_ARM_NEON_DOT;
GemmMicrokernelTester()
@@ -478,6 +2302,462 @@
#endif // XNN_ARCH_ARM64
+#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(37)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(163)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+        .n(n)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+        .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+        .n(n)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM64
+
+
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(QS8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8) {
TEST_REQUIRES_ARM_NEON;
diff --git a/test/qs8-gemm-minmax-fp32.yaml b/test/qs8-gemm-minmax-fp32.yaml
index d489b28..37cbbb0 100644
--- a/test/qs8-gemm-minmax-fp32.yaml
+++ b/test/qs8-gemm-minmax-fp32.yaml
@@ -3,9 +3,24 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld32
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 4
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x16c4__aarch64_neondot_ld64
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 8
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld32
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 4
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 8
- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
k-block: 16
+- name: xnn_qs8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 16
- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane
init: xnn_init_qs8_conv_minmax_fp32_neon_params
k-block: 8
diff --git a/test/qs8-gemm-minmax-gemmlowp.cc b/test/qs8-gemm-minmax-gemmlowp.cc
index e724e64..bf6466b 100644
--- a/test/qs8-gemm-minmax-gemmlowp.cc
+++ b/test/qs8-gemm-minmax-gemmlowp.cc
@@ -23,6 +23,2742 @@
#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(4)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(4)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(4)
+ .a_stride(7)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(7)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 4; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 5; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 8; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)  // fix: was .n(16), which left the n loop dead; test each n > nr
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)  // fix: was .n(16), which left the n loop dead; matches the _strided_a variant
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)  // fix: was .n(16), which left the n loop dead; test multiples of nr
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(4)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(4)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(4)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)  // fix: was .n(16), which left the n loop dead; test each n > nr
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)  // fix: was .n(16), which left the n loop dead; matches the _strided_a variant
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)  // fix: was .n(16), which left the n loop dead; test multiples of nr
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(4)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(4)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(4)
+ .a_stride(7)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(4)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 4; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(7)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 4; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 5; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 5; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 8; k <= 40; k += 4) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 8; k <= 40; k += 4) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)  // was .n(16): loop variable n was unused, so n > 16 was never exercised
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)  // was .n(16): use the loop's n, consistent with n_div_16_strided_cn below
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)  // was .n(16): loop variable n was unused, so multiples of 16 above 16 were never tested
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(23)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 20; k += 5) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(4)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(4)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(4)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)  // was .n(16): loop variable n was unused, so n > 16 was never exercised
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)  // was .n(16): use the loop's n, consistent with n_div_16_strided_cn below
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)  // was .n(16): loop variable n was unused, so multiples of 16 above 16 were never tested
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(37)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(163)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)  // was .n(16): loop variable n was unused, so n > 16 was never exercised
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)  // was .n(16): use the loop's n, consistent with n_div_16_strided_cn below
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)  // was .n(16): loop variable n was unused, so multiples of 16 above 16 were never tested
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(37)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(163)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cm_stride(19)
+ .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
@@ -40150,2742 +42886,6 @@
#endif // XNN_ARCH_ARM64
-#if XNN_ARCH_ARM64
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(4)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(4)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(4)
- .a_stride(7)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(4)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(16)
- .k(4)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(n)
- .k(4)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 4; k++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 4; k++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .a_stride(7)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 4; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 5; k < 8; k++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 5; k < 8; k++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 5; k < 8; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 8; k <= 40; k += 4) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 8; k <= 40; k += 4) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 8; k <= 40; k += 4) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 20; k += 5) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 20; k += 5) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 20; k += 5) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(23)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 20; k += 5) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 20; k += 5) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 20; k += 5) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 20; k += 5) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(23)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 20; k += 5) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k <= 20; k += 5) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(19)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, qmin) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(4)
- .qmin(128)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, qmax) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(4)
- .qmax(128)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(4)
- .cm_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(8)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(8)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(16)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_div_8_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(19)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, qmin) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(8)
- .qmin(128)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, qmax) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(8)
- .qmax(128)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(1)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(1)
- .n(16)
- .k(8)
- .cm_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(4)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(4)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(4)
- .a_stride(7)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(4)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_m) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(16)
- .k(4)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_eq_4_subtile_n) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(4)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 4; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 4; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .a_stride(7)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_lt_4_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 4; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 5; k < 8; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 5; k < 8; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_gt_4_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 5; k < 8; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_div_4) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 8; k <= 40; k += 4) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 8; k <= 40; k += 4) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, k_div_4_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 8; k <= 40; k += 4) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 20; k += 5) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 20; k += 5) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 20; k += 5) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(23)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_gt_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 20; k += 5) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 20; k += 5) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 20; k += 5) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 20; k += 5) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(23)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, n_div_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 20; k += 5) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, strided_cm_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k <= 20; k += 5) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(19)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, qmin) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(4)
- .qmin(128)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, qmax) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(4)
- .qmax(128)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD32, strided_cm) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(4)
- .cm_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(8)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(8)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(16)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(19)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmin) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(8)
- .qmin(128)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmax) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(8)
- .qmax(128)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(8)
- .cm_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .a_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(16)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(16)
- .k(16)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(16)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 16; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 16; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 16; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 17; k < 32; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 17; k < 32; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .a_stride(37)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 17; k < 32; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 32; k <= 160; k += 16) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 32; k <= 160; k += 16) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .a_stride(163)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 32; k <= 160; k += 16) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 80; k += 17) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 80; k += 17) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k <= 80; k += 17) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(19)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmin) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .qmin(128)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmax) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .qmax(128)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .cm_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .a_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(16)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_m) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(16)
- .k(16)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_n) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(16)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 16; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 16; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 16; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 17; k < 32; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 17; k < 32; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .a_stride(37)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 17; k < 32; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 32; k <= 160; k += 16) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 32; k <= 160; k += 16) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .a_stride(163)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 32; k <= 160; k += 16) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 80; k += 17) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_a) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 80; k += 17) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k <= 80; k += 17) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(19)
- .iterations(1)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmin) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .qmin(128)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmax) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .qmax(128)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .cm_stride(19)
- .Test(xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_ARM64
-
-
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X4C2__SSE2_LD64, k_eq_8) {
TEST_REQUIRES_X86_SSE2;
diff --git a/test/qs8-gemm-minmax-gemmlowp.yaml b/test/qs8-gemm-minmax-gemmlowp.yaml
index a700b01..878e34a 100644
--- a/test/qs8-gemm-minmax-gemmlowp.yaml
+++ b/test/qs8-gemm-minmax-gemmlowp.yaml
@@ -3,6 +3,24 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32
+ init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+ k-block: 4
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64
+ init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+ k-block: 8
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32
+ init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+ k-block: 4
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
+ init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+ k-block: 8
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128
+ init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+ k-block: 16
+- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
+ init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+ k-block: 16
- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
k-block: 8
@@ -267,24 +285,6 @@
- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c16__aarch64_neon_mlal_padal
init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld32
- init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
- k-block: 4
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c4__aarch64_neondot_ld64
- init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
- k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld32
- init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
- k-block: 4
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
- init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
- k-block: 8
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128
- init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
- k-block: 16
-- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
- init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
- k-block: 16
- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld64
init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
k-block: 8
diff --git a/test/qs8-igemm-minmax-fp32.cc b/test/qs8-igemm-minmax-fp32.cc
index 8f6b855..4db5f4e 100644
--- a/test/qs8-igemm-minmax-fp32.cc
+++ b/test/qs8-igemm-minmax-fp32.cc
@@ -22,6 +22,1410 @@
#include "gemm-microkernel-tester.h"
+#if XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, a_offset) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(331)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, zero) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(331)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, a_offset) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, zero) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, a_offset) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(331)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, zero) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(331)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM64
+
+
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8) {
TEST_REQUIRES_ARM_NEON;
diff --git a/test/qs8-igemm-minmax-fp32.yaml b/test/qs8-igemm-minmax-fp32.yaml
index 24f7599..7861808 100644
--- a/test/qs8-igemm-minmax-fp32.yaml
+++ b/test/qs8-igemm-minmax-fp32.yaml
@@ -3,6 +3,15 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 16
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld64
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 16
- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane
init: xnn_init_qs8_conv_minmax_fp32_neon_params
k-block: 8
diff --git a/test/qs8-igemm-minmax-gemmlowp.cc b/test/qs8-igemm-minmax-gemmlowp.cc
index 590d5e9..fb8862e 100644
--- a/test/qs8-igemm-minmax-gemmlowp.cc
+++ b/test/qs8-igemm-minmax-gemmlowp.cc
@@ -491,6 +491,942 @@
#if XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, a_offset) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, zero) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, a_offset) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(331)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, zero) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(331)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16__AARCH64_NEON_MLAL_LANE_PRFM_CORTEX_A53, k_eq_8) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
@@ -41206,942 +42142,6 @@
#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
-#if XNN_ARCH_ARM64
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(8)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(8)
- .cn_stride(19)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_m) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(16)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_eq_8_subtile_n) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_lt_8_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_gt_8_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, k_div_8_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, small_kernel) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .ks(3)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, small_kernel_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .ks(3)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_gt_16_small_kernel) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .ks(3)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, n_div_16_small_kernel) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .ks(3)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(19)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, a_offset) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .ks(3)
- .a_offset(163)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, zero) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t mz = 0; mz < 4; mz++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .ks(3)
- .a_offset(163)
- .zero_index(mz)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmin) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(8)
- .qmin(128)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, qmax) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(8)
- .qmax(128)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD64, strided_cm) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(8)
- .cm_stride(19)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM64
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .cn_stride(19)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(16)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(16)
- .k(16)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(16)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 16; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k < 16; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 17; k < 32; k++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 17; k < 32; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 32; k <= 160; k += 16) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 32; k <= 160; k += 16) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 80; k += 17) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_cn) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .cn_stride(19)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 80; k += 17) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, small_kernel) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .ks(3)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, small_kernel_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k <= 80; k += 17) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .ks(3)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_small_kernel) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 17; n < 32; n++) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .ks(3)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_small_kernel) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t n = 32; n <= 48; n += 16) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .ks(3)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k <= 80; k += 17) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 16; n++) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(19)
- .iterations(1)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, a_offset) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .ks(3)
- .a_offset(331)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, zero) {
- TEST_REQUIRES_ARM_NEON_DOT;
- for (uint32_t mz = 0; mz < 4; mz++) {
- for (size_t k = 1; k <= 80; k += 17) {
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(k)
- .ks(3)
- .a_offset(331)
- .zero_index(mz)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmin) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .qmin(128)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, qmax) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .qmax(128)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_IGEMM_MINMAX_GEMMLOWP_4X16C4__AARCH64_NEONDOT_LD128, strided_cm) {
- TEST_REQUIRES_ARM_NEON_DOT;
- GemmMicrokernelTester()
- .mr(4)
- .nr(16)
- .kr(4)
- .sr(1)
- .m(4)
- .n(16)
- .k(16)
- .cm_stride(19)
- .Test(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qs8_conv_minmax_gemmlowp_neon_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_ARM64
-
-
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QS8_IGEMM_MINMAX_GEMMLOWP_1X4C2__SSE2_LD64, k_eq_8) {
TEST_REQUIRES_X86_SSE2;
diff --git a/test/qs8-igemm-minmax-gemmlowp.yaml b/test/qs8-igemm-minmax-gemmlowp.yaml
index 86c7eb5..348ae0e 100644
--- a/test/qs8-igemm-minmax-gemmlowp.yaml
+++ b/test/qs8-igemm-minmax-gemmlowp.yaml
@@ -6,6 +6,12 @@
- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_cortex_a55
init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
k-block: 16
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
+ init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128
+ init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
+ k-block: 16
- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53
init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
k-block: 8
@@ -267,12 +273,6 @@
- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_8x16c4__neondot
init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld64
- init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
- k-block: 8
-- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16c4__aarch64_neondot_ld128
- init: xnn_init_qs8_conv_minmax_gemmlowp_neon_params
- k-block: 16
- name: xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x4c2__sse2_ld64
init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
k-block: 8