NEON QS8 IGEMM microkernels with FP32 requantization
PiperOrigin-RevId: 381645889
diff --git a/BUILD.bazel b/BUILD.bazel
index 8e8f41e..27f1244 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1822,10 +1822,12 @@
"src/qs8-igemm/gen/1x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c",
"src/qs8-igemm/gen/1x8c2-minmax-gemmlowp-neon-mull-padal-dup.c",
"src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c",
+ "src/qs8-igemm/gen/1x8c8-minmax-fp32-neon-mlal-padal.c",
"src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c",
"src/qs8-igemm/gen/1x8c16-minmax-gemmlowp-neon-mlal-padal.c",
"src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane-prfm.c",
"src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane.c",
+ "src/qs8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane.c",
"src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mull-addw-dup.c",
"src/qs8-igemm/gen/1x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c",
"src/qs8-igemm/gen/1x16c2-minmax-gemmlowp-neon-mull-padal-dup.c",
@@ -1838,6 +1840,7 @@
"src/qs8-igemm/gen/2x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c",
"src/qs8-igemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c",
"src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c",
+ "src/qs8-igemm/gen/2x8c8-minmax-fp32-neon-mlal-padal.c",
"src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c",
"src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-neon-mlal-padal.c",
"src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane-prfm.c",
@@ -1874,6 +1877,7 @@
"src/qs8-igemm/gen/4x8c16-minmax-gemmlowp-neon-mlal-padal.c",
"src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane-prfm.c",
"src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane.c",
+ "src/qs8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane.c",
"src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mull-addw-dup.c",
"src/qs8-igemm/gen/4x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c",
"src/qs8-igemm/gen/4x16c2-minmax-gemmlowp-neon-mull-padal-dup.c",
@@ -2245,10 +2249,6 @@
]
NEONV8_UKERNELS = [
- "src/qs8-gemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c",
- "src/qs8-gemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c",
- "src/qs8-gemm/gen/1x8c8-minmax-fp32-neonv8-mlal-padal.c",
- "src/qs8-gemm/gen/2x8c8-minmax-fp32-neonv8-mlal-padal.c",
"src/qs8-dwconv/gen/up8x9-minmax-fp32-neonv8-mul16.c",
"src/qs8-dwconv/gen/up8x25-minmax-fp32-neonv8-mul16.c",
"src/qs8-dwconv/gen/up16x9-minmax-fp32-neonv8-mul16.c",
@@ -2257,6 +2257,14 @@
"src/qs8-dwconv/gen/up24x25-minmax-fp32-neonv8-mul16.c",
"src/qs8-dwconv/gen/up32x9-minmax-fp32-neonv8-mul16.c",
"src/qs8-dwconv/gen/up32x25-minmax-fp32-neonv8-mul16.c",
+ "src/qs8-gemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c",
+ "src/qs8-gemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c",
+ "src/qs8-gemm/gen/1x8c8-minmax-fp32-neonv8-mlal-padal.c",
+ "src/qs8-gemm/gen/2x8c8-minmax-fp32-neonv8-mlal-padal.c",
+ "src/qs8-igemm/gen/1x8c8-minmax-fp32-neonv8-mlal-padal.c",
+ "src/qs8-igemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c",
+ "src/qs8-igemm/gen/2x8c8-minmax-fp32-neonv8-mlal-padal.c",
+ "src/qs8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c",
"src/f32-vrnd/gen/vrndd-neonv8-x4.c",
"src/f32-vrnd/gen/vrndd-neonv8-x8.c",
"src/f32-vrnd/gen/vrndne-neonv8-x4.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2693351..d21d2c4 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1069,10 +1069,12 @@
src/qs8-igemm/gen/1x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
src/qs8-igemm/gen/1x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
+ src/qs8-igemm/gen/1x8c8-minmax-fp32-neon-mlal-padal.c
src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
src/qs8-igemm/gen/1x8c16-minmax-gemmlowp-neon-mlal-padal.c
src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane.c
+ src/qs8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane.c
src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mull-addw-dup.c
src/qs8-igemm/gen/1x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
src/qs8-igemm/gen/1x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -1083,6 +1085,7 @@
src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane.c
src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mull-addw-dup.c
src/qs8-igemm/gen/2x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
+ src/qs8-igemm/gen/2x8c2-minmax-fp32-neon-mlal-padal-dup.c
src/qs8-igemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -1121,6 +1124,7 @@
src/qs8-igemm/gen/4x8c16-minmax-gemmlowp-neon-mlal-padal.c
src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane.c
+ src/qs8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane.c
src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mull-addw-dup.c
src/qs8-igemm/gen/4x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
src/qs8-igemm/gen/4x16c2-minmax-gemmlowp-neon-mull-padal-dup.c
@@ -1387,6 +1391,10 @@
src/qs8-gemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c
src/qs8-gemm/gen/2x8c8-minmax-fp32-neonv8-mlal-padal.c
src/qs8-gemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c
+ src/qs8-igemm/gen/1x8c8-minmax-fp32-neonv8-mlal-padal.c
+ src/qs8-igemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c
+ src/qs8-igemm/gen/2x8c8-minmax-fp32-neonv8-mlal-padal.c
+ src/qs8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c
src/f32-vrnd/gen/vrndd-neonv8-x4.c
src/f32-vrnd/gen/vrndd-neonv8-x8.c
src/f32-vrnd/gen/vrndne-neonv8-x4.c
diff --git a/scripts/generate-qs8-igemm.sh b/scripts/generate-qs8-igemm.sh
index f405a05..7858d3c 100755
--- a/scripts/generate-qs8-igemm.sh
+++ b/scripts/generate-qs8-igemm.sh
@@ -26,27 +26,33 @@
tools/xngen src/qs8-igemm/MRx4c8-wasmsimd.c.in -D MR=3 -D VARIANT=LD128 -o src/qs8-igemm/gen/3x4c8-minmax-gemmlowp-wasmsimd-ld128.c
################################### ARM NEON ##################################
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=8 -D PREFETCH=0 -o src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=2 -D NR=8 -D PREFETCH=0 -o src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=3 -D NR=8 -D PREFETCH=0 -o src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=8 -D PREFETCH=0 -o src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=6 -D NR=8 -D PREFETCH=0 -o src/qs8-igemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=16 -D PREFETCH=0 -o src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=2 -D NR=16 -D PREFETCH=0 -o src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=3 -D NR=16 -D PREFETCH=0 -o src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=16 -D PREFETCH=0 -o src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=6 -D NR=16 -D PREFETCH=0 -o src/qs8-igemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=2 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=3 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=6 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=2 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=3 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=6 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=8 -D PREFETCH=1 -o src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=2 -D NR=8 -D PREFETCH=1 -o src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=3 -D NR=8 -D PREFETCH=1 -o src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=8 -D PREFETCH=1 -o src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=6 -D NR=8 -D PREFETCH=1 -o src/qs8-igemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=16 -D PREFETCH=1 -o src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=2 -D NR=16 -D PREFETCH=1 -o src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=3 -D NR=16 -D PREFETCH=1 -o src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=16 -D PREFETCH=1 -o src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
-tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=6 -D NR=16 -D PREFETCH=1 -o src/qs8-igemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D ARMV8=0 -o src/qs8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D ARMV8=0 -o src/qs8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane.c
+
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D ARMV8=1 -o src/qs8-igemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D ARMV8=1 -o src/qs8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c
+
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=8 -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=2 -D NR=8 -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=3 -D NR=8 -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/3x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=8 -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/4x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=6 -D NR=8 -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/6x8-minmax-gemmlowp-neon-mlal-lane-prfm.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=16 -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=2 -D NR=16 -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=3 -D NR=16 -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/3x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=16 -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/4x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=6 -D NR=16 -D PREFETCH=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/6x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
tools/xngen src/qs8-igemm/neon-mull-addw-dup.c.in -D MR=1 -D NR=8 -o src/qs8-igemm/gen/1x8-minmax-gemmlowp-neon-mull-addw-dup.c
tools/xngen src/qs8-igemm/neon-mull-addw-dup.c.in -D MR=2 -D NR=8 -o src/qs8-igemm/gen/2x8-minmax-gemmlowp-neon-mull-addw-dup.c
@@ -78,23 +84,29 @@
tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MR=4 -D NR=16 -D MLA=1 -o src/qs8-igemm/gen/4x16c2-minmax-gemmlowp-neon-mlal-padal-dup.c
### C8 micro-kernels
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=8 -D MLA=0 -o src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=8 -D MLA=0 -o src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=8 -D MLA=0 -o src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=8 -D MLA=0 -o src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=16 -D MLA=0 -o src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=16 -D MLA=0 -o src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=16 -D MLA=0 -o src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=16 -D MLA=0 -o src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=8 -D MLA=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=8 -D MLA=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=8 -D MLA=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=8 -D MLA=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=16 -D MLA=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=16 -D MLA=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=16 -D MLA=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=16 -D MLA=0 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=8 -D MLA=1 -o src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=8 -D MLA=1 -o src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=8 -D MLA=1 -o src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=8 -D MLA=1 -o src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=16 -D MLA=1 -o src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=16 -D MLA=1 -o src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=16 -D MLA=1 -o src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c
-tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=16 -D MLA=1 -o src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=8 -D MLA=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=8 -D MLA=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=8 -D MLA=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=8 -D MLA=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=16 -D MLA=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=16 -D MLA=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=16 -D MLA=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=16 -D MLA=1 -D REQUANTIZATION=GEMMLOWP -D ARMV8=0 -o src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c
+
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=8 -D MLA=1 -D REQUANTIZATION=FP32 -D ARMV8=0 -o src/qs8-igemm/gen/1x8c8-minmax-fp32-neon-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=8 -D MLA=1 -D REQUANTIZATION=FP32 -D ARMV8=0 -o src/qs8-igemm/gen/2x8c8-minmax-fp32-neon-mlal-padal.c
+
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=1 -D NR=8 -D MLA=1 -D REQUANTIZATION=FP32 -D ARMV8=1 -o src/qs8-igemm/gen/1x8c8-minmax-fp32-neonv8-mlal-padal.c
+tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=2 -D NR=8 -D MLA=1 -D REQUANTIZATION=FP32 -D ARMV8=1 -o src/qs8-igemm/gen/2x8c8-minmax-fp32-neonv8-mlal-padal.c
### C16 micro-kernels
tools/xngen src/qs8-igemm/c16-neon-mlal-padal.c.in -D MR=1 -D NR=8 -o src/qs8-igemm/gen/1x8c16-minmax-gemmlowp-neon-mlal-padal.c
diff --git a/src/qs8-igemm/c8-neon-mull-padal.c.in b/src/qs8-igemm/c8-neon-mull-padal.c.in
index 75ddbe3..9dbc412 100644
--- a/src/qs8-igemm/c8-neon-mull-padal.c.in
+++ b/src/qs8-igemm/c8-neon-mull-padal.c.in
@@ -6,6 +6,7 @@
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
$assert NR % 8 == 0
$assert 8 <= NR <= 16
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
#include <assert.h>
#include <arm_neon.h>
@@ -14,7 +15,9 @@
#include <xnnpack/math.h>
-void xnn_qs8_igemm_minmax_gemmlowp_ukernel_${MR}x${NR}c8__neon_${"mlal" if MLA else "mull"}_padal(
+$PARAMS_STRUCT = REQUANTIZATION.lower() + "_" + ("neonv8" if ARMV8 else "neon")
+$ISA = "neonv8" if ARMV8 else "neon"
+void xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x${NR}c8__${ISA}_${"mlal" if MLA else "mull"}_padal(
size_t mr,
size_t nc,
size_t kc,
@@ -133,78 +136,144 @@
int32x4_t vacc${M}x${ABC[N:N+4]} = vcombine_s32(vsum${M}x${ABC[N:N+2]}, vsum${M}x${ABC[N+2:N+4]} );
#endif
- const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->gemmlowp_neon.multiplier);
- $for M in range(MR):
- $for N in range(0, NR, 4):
- vacc${M}x${ABC[N:N+4]} = vqrdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
+ $if REQUANTIZATION == "GEMMLOWP":
+ const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->${PARAMS_STRUCT}.multiplier);
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vacc${M}x${ABC[N:N+4]} = vqrdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
- const int32x4_t vright_shift = vld1q_dup_s32(¶ms->gemmlowp_neon.right_shift);
- const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
- $for M in range(MR):
- $for N in range(0, NR, 4):
- vacc${M}x${ABC[N:N+4]} = vsraq_n_s32(vacc${M}x${ABC[N:N+4]}, vbicq_s32(vacc${M}x${ABC[N:N+4]}, vzero_shift_mask), 31);
+ const int32x4_t vright_shift = vld1q_dup_s32(¶ms->${PARAMS_STRUCT}.right_shift);
+ const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vacc${M}x${ABC[N:N+4]} = vsraq_n_s32(vacc${M}x${ABC[N:N+4]}, vbicq_s32(vacc${M}x${ABC[N:N+4]}, vzero_shift_mask), 31);
- $for M in range(MR):
- $for N in range(0, NR, 4):
- vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_shift);
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_shift);
+ $elif REQUANTIZATION == "FP32":
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ float32x4_t vfpacc${M}x${ABC[N:N+4]} = vcvtq_f32_s32(vacc${M}x${ABC[N:N+4]});
- const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->gemmlowp_neon.output_zero_point);
+ const float32x4_t vscale = vld1q_dup_f32(¶ms->${PARAMS_STRUCT}.scale);
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vfpacc${M}x${ABC[N:N+4]} = vmulq_f32(vfpacc${M}x${ABC[N:N+4]}, vscale);
+
+ $if ARMV8:
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vacc${M}x${ABC[N:N+4]} = vcvtnq_s32_f32(vfpacc${M}x${ABC[N:N+4]});
+ $else:
+ const float32x4_t voutput_min_less_zero_point = vld1q_dup_f32(¶ms->${PARAMS_STRUCT}.output_min_less_zero_point);
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vfpacc${M}x${ABC[N:N+4]} = vmaxq_f32(vfpacc${M}x${ABC[N:N+4]}, voutput_min_less_zero_point);
+
+ const float32x4_t voutput_max_less_zero_point = vld1q_dup_f32(¶ms->${PARAMS_STRUCT}.output_max_less_zero_point);
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vfpacc${M}x${ABC[N:N+4]} = vminq_f32(vfpacc${M}x${ABC[N:N+4]}, voutput_max_less_zero_point);
+
+ const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->${PARAMS_STRUCT}.magic_bias);
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vacc${M}x${ABC[N:N+4]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${M}x${ABC[N:N+4]}, vmagic_bias));
+
+ const int32x4_t vmagic_bias_less_zero_point = vld1q_dup_s32(¶ms->${PARAMS_STRUCT}.magic_bias_less_zero_point);
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vacc${M}x${ABC[N:N+4]} = vsubq_s32(vacc${M}x${ABC[N:N+4]}, vmagic_bias_less_zero_point);
+
+ $if REQUANTIZATION != "FP32" or ARMV8:
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->${PARAMS_STRUCT}.output_zero_point);
#if XNN_ARCH_ARM64
- $for M in range(MR):
- $for N in range(0, NR, 8):
- const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
- $for M in range(MR):
- $for N in range(0, NR, 16):
- $if N + 8 < NR:
- int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
- $elif M % 2 == 1:
- int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
- $elif M + 1 == MR:
- int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
-#else
- $for M in range(MR):
- $for N in range(0, NR, 8):
- const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
+ $if REQUANTIZATION == "FP32" and not ARMV8:
+ $for M in range(MR):
+ $for N in range(0, NR, 8):
+ const int16x8_t vacc${M}x${ABC[N:N+8]} = vmovn_high_s32(vmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]});
- $for M in range(MR):
- $for N in range(0, NR, 16):
- $if N + 8 < NR:
- int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
- $elif M % 2 == 1:
- int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N:N+8]}));
- $elif M + 1 == MR:
- int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
-#endif
- $if NR == 8 and MR == 1:
- const int8x8_t voutput_min = vld1_dup_s8(¶ms->gemmlowp_neon.output_min);
- const int8x8_t voutput_max = vld1_dup_s8(¶ms->gemmlowp_neon.output_max);
+ $for M in range(MR):
+ $for N in range(0, NR, 16):
+ $if N + 8 < NR:
+ int8x16_t vout${M}x${ABC[N:N+16]} = vmovn_high_s16(vmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
+ $elif M % 2 == 1:
+ int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmovn_high_s16(vmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
+ $elif M + 1 == MR:
+ int8x8_t vout${M}x${ABC[N:N+8]} = vmovn_s16(vacc${M}x${ABC[N:N+8]});
$else:
- const int8x16_t voutput_min = vld1q_dup_s8(¶ms->gemmlowp_neon.output_min);
- const int8x16_t voutput_max = vld1q_dup_s8(¶ms->gemmlowp_neon.output_max);
+ $for M in range(MR):
+ $for N in range(0, NR, 8):
+ const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
- $for M in reversed(range(MR)):
- $for N in range(0, NR, 16):
- $if N + 8 < NR:
- vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min);
- $elif M % 2 == 1:
- vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
- $elif M + 1 == MR:
- $if NR == 8 and MR == 1:
- vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min);
- $else:
- vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min));
+ $for M in range(MR):
+ $for N in range(0, NR, 16):
+ $if N + 8 < NR:
+ int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
+ $elif M % 2 == 1:
+ int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
+ $elif M + 1 == MR:
+ int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#else
+ $if REQUANTIZATION == "FP32" and not ARMV8:
+ $for M in range(MR):
+ $for N in range(0, NR, 8):
+ const int16x8_t vacc${M}x${ABC[N:N+8]} = vcombine_s16(vmovn_s32(vacc${M}x${ABC[N:N+4]}), vmovn_s32(vacc${M}x${ABC[N+4:N+8]}));
- $for M in reversed(range(MR)):
- $for N in range(0, NR, 16):
- $if N + 8 < NR:
- vout${M}x${ABC[N:N+16]} = vminq_s8(vout${M}x${ABC[N:N+16]}, voutput_max);
- $elif M % 2 == 1:
- vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
- $elif M + 1 == MR:
- $if NR == 8 and MR == 1:
- vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, voutput_max);
- $else:
- vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_max));
+ $for M in range(MR):
+ $for N in range(0, NR, 16):
+ $if N + 8 < NR:
+ int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vmovn_s16(vacc${M}x${ABC[N:N+8]}), vmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
+ $elif M % 2 == 1:
+ int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vmovn_s16(vacc${M}x${ABC[N:N+8]}));
+ $elif M + 1 == MR:
+ int8x8_t vout${M}x${ABC[N:N+8]} = vmovn_s16(vacc${M}x${ABC[N:N+8]});
+ $else:
+ $for M in range(MR):
+ $for N in range(0, NR, 8):
+ const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
+
+ $for M in range(MR):
+ $for N in range(0, NR, 16):
+ $if N + 8 < NR:
+ int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
+ $elif M % 2 == 1:
+ int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N:N+8]}));
+ $elif M + 1 == MR:
+ int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#endif
+ $if REQUANTIZATION != "FP32" or ARMV8:
+ $if NR == 8 and MR == 1:
+ const int8x8_t voutput_min = vld1_dup_s8(¶ms->${PARAMS_STRUCT}.output_min);
+ const int8x8_t voutput_max = vld1_dup_s8(¶ms->${PARAMS_STRUCT}.output_max);
+ $else:
+ const int8x16_t voutput_min = vld1q_dup_s8(¶ms->${PARAMS_STRUCT}.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(¶ms->${PARAMS_STRUCT}.output_max);
+
+ $for M in reversed(range(MR)):
+ $for N in range(0, NR, 16):
+ $if N + 8 < NR:
+ vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min);
+ $elif M % 2 == 1:
+ vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
+ $elif M + 1 == MR:
+ $if NR == 8 and MR == 1:
+ vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min);
+ $else:
+ vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min));
+
+ $for M in reversed(range(MR)):
+ $for N in range(0, NR, 16):
+ $if N + 8 < NR:
+ vout${M}x${ABC[N:N+16]} = vminq_s8(vout${M}x${ABC[N:N+16]}, voutput_max);
+ $elif M % 2 == 1:
+ vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
+ $elif M + 1 == MR:
+ $if NR == 8 and MR == 1:
+ vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, voutput_max);
+ $else:
+ vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_max));
if (nc >= ${NR}) {
$for M in reversed(range(MR)):
diff --git a/src/qs8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane.c b/src/qs8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane.c
new file mode 100644
index 0000000..38b07f4
--- /dev/null
+++ b/src/qs8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane.c
@@ -0,0 +1,314 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/neon-mlal-lane.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ a += 1;
+
+ size_t k = kc;
+ while (k >= 8 * sizeof(int8_t)) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+ const int16x8_t vxa0 = vmovl_s8(va0);
+
+ const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+ const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+ const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+ const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+
+
+ const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+ const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+ const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+ const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc7 = vmovl_s8(vb89ABCDEFc7);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+
+ k -= 8 * sizeof(int8_t);
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+ const int16x8_t vxa0 = vmovl_s8(va0);
+
+ const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+ const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+
+ if (k >= 2 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+ const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+
+ if (k > 2 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+ const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+
+ if (k >= 4 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+ const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+
+ if (k > 4 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+ const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+
+ if (k >= 6 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+ const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+
+ if (k > 6 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+ const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ p -= 1 * sizeof(void*);
+ } while (p != 0);
+
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+ float32x4_t vfpacc0x89AB = vcvtq_f32_s32(vacc0x89AB);
+ float32x4_t vfpacc0xCDEF = vcvtq_f32_s32(vacc0xCDEF);
+
+ const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neon.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+ vfpacc0x89AB = vmulq_f32(vfpacc0x89AB, vscale);
+ vfpacc0xCDEF = vmulq_f32(vfpacc0xCDEF, vscale);
+
+ const float32x4_t voutput_min_less_zero_point = vld1q_dup_f32(¶ms->fp32_neon.output_min_less_zero_point);
+ vfpacc0x0123 = vmaxq_f32(vfpacc0x0123, voutput_min_less_zero_point);
+ vfpacc0x4567 = vmaxq_f32(vfpacc0x4567, voutput_min_less_zero_point);
+ vfpacc0x89AB = vmaxq_f32(vfpacc0x89AB, voutput_min_less_zero_point);
+ vfpacc0xCDEF = vmaxq_f32(vfpacc0xCDEF, voutput_min_less_zero_point);
+
+ const float32x4_t voutput_max_less_zero_point = vld1q_dup_f32(¶ms->fp32_neon.output_max_less_zero_point);
+ vfpacc0x0123 = vminq_f32(vfpacc0x0123, voutput_max_less_zero_point);
+ vfpacc0x4567 = vminq_f32(vfpacc0x4567, voutput_max_less_zero_point);
+ vfpacc0x89AB = vminq_f32(vfpacc0x89AB, voutput_max_less_zero_point);
+ vfpacc0xCDEF = vminq_f32(vfpacc0xCDEF, voutput_max_less_zero_point);
+
+ const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->fp32_neon.magic_bias);
+ vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias));
+ vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias));
+ vacc0x89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x89AB, vmagic_bias));
+ vacc0xCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpacc0xCDEF, vmagic_bias));
+
+ const int32x4_t vmagic_bias_less_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_zero_point);
+ vacc0x0123 = vsubq_s32(vacc0x0123, vmagic_bias_less_zero_point);
+ vacc0x4567 = vsubq_s32(vacc0x4567, vmagic_bias_less_zero_point);
+ vacc0x89AB = vsubq_s32(vacc0x89AB, vmagic_bias_less_zero_point);
+ vacc0xCDEF = vsubq_s32(vacc0xCDEF, vmagic_bias_less_zero_point);
+
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vmovn_high_s32(vmovn_s32(vacc0x0123), vacc0x4567);
+ const int16x8_t vacc0x89ABCDEF = vmovn_high_s32(vmovn_s32(vacc0x89AB), vacc0xCDEF);
+
+ int8x16_t vout0x0123456789ABCDEF = vmovn_high_s16(vmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vcombine_s16(vmovn_s32(vacc0x0123), vmovn_s32(vacc0x4567));
+ const int16x8_t vacc0x89ABCDEF = vcombine_s16(vmovn_s32(vacc0x89AB), vmovn_s32(vacc0xCDEF));
+
+ int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vmovn_s16(vacc0x01234567), vmovn_s16(vacc0x89ABCDEF));
+#endif
+
+ if (nc >= 16) {
+ vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 16;
+ } else {
+ int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+ if (nc & 8) {
+ vst1_s8(c0, vout0x01234567); c0 += 8;
+ vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+ }
+ if (nc & 4) {
+ vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+ vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+ vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1_lane_s8(c0, vout0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c b/src/qs8-igemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c
new file mode 100644
index 0000000..ac98b47
--- /dev/null
+++ b/src/qs8-igemm/gen/1x16-minmax-fp32-neonv8-mlal-lane.c
@@ -0,0 +1,302 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/neon-mlal-lane.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ a += 1;
+
+ size_t k = kc;
+ while (k >= 8 * sizeof(int8_t)) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+ const int16x8_t vxa0 = vmovl_s8(va0);
+
+ const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+ const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+ const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+ const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+
+
+ const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+ const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+ const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+ const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc7 = vmovl_s8(vb89ABCDEFc7);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+
+ k -= 8 * sizeof(int8_t);
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+ const int16x8_t vxa0 = vmovl_s8(va0);
+
+ const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+ const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+
+ if (k >= 2 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+ const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+
+ if (k > 2 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+ const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+
+ if (k >= 4 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+ const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+
+ if (k > 4 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+ const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+
+ if (k >= 6 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+ const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+
+ if (k > 6 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+ const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ p -= 1 * sizeof(void*);
+ } while (p != 0);
+
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+ float32x4_t vfpacc0x89AB = vcvtq_f32_s32(vacc0x89AB);
+ float32x4_t vfpacc0xCDEF = vcvtq_f32_s32(vacc0xCDEF);
+
+ const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neonv8.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+ vfpacc0x89AB = vmulq_f32(vfpacc0x89AB, vscale);
+ vfpacc0xCDEF = vmulq_f32(vfpacc0xCDEF, vscale);
+
+ vacc0x0123 = vcvtnq_s32_f32(vfpacc0x0123);
+ vacc0x4567 = vcvtnq_s32_f32(vfpacc0x4567);
+ vacc0x89AB = vcvtnq_s32_f32(vfpacc0x89AB);
+ vacc0xCDEF = vcvtnq_s32_f32(vfpacc0xCDEF);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->fp32_neonv8.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+ const int8x16_t voutput_min = vld1q_dup_s8(¶ms->fp32_neonv8.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(¶ms->fp32_neonv8.output_max);
+
+ vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+ vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 16;
+ } else {
+ int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+ if (nc & 8) {
+ vst1_s8(c0, vout0x01234567); c0 += 8;
+ vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+ }
+ if (nc & 4) {
+ vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+ vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+ vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1_lane_s8(c0, vout0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c
index 29bce7e..1684424 100644
--- a/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -285,6 +285,7 @@
#if XNN_ARCH_ARM64
const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+
int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
#else
const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
diff --git a/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c
index 08864a6..52693ec 100644
--- a/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/1x16c8-minmax-gemmlowp-neon-mull-padal.c
@@ -196,6 +196,7 @@
#if XNN_ARCH_ARM64
const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+
int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
#else
const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-fp32-neon-mlal-padal.c b/src/qs8-igemm/gen/1x8c8-minmax-fp32-neon-mlal-padal.c
new file mode 100644
index 0000000..602d752
--- /dev/null
+++ b/src/qs8-igemm/gen/1x8c8-minmax-fp32-neon-mlal-padal.c
@@ -0,0 +1,231 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8 * sizeof(int8_t));
+ int8_t* c0 = c;
+
+ do {
+ int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ a += 1;
+
+ size_t k = kc;
+ // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+ while (k >= 16 * sizeof(int8_t)) {
+ const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+
+ const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+
+ const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+ vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+ vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+ const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+ vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+ vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+ const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+ vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+ vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+ const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+ vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+ vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+ const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+ vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+ vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+ const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+ vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+ vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+ const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+ vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+ vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+ const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+ vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+ vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+
+ k -= 16 * sizeof(int8_t);
+ }
+
+ // Handle 8 bytes at a time using MUL.
+ if (k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+ const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+ vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+ const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+ vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+ const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+ vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+ const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+ vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+ const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+ vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+ const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+ vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+ const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+ vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+ const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+ vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+
+ k -= 8 * sizeof(int8_t);
+ }
+
+ p -= 1 * sizeof(void*);
+ } while (p != 0);
+
+#if XNN_ARCH_ARM64
+ const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+ const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+ const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+ const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+ int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+ int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+#else
+ const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+ const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+ const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+ const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+ const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+ const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+ int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+ const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+ const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+ const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+ const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+ const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+ const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+ int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+#endif
+
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+
+ const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neon.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+
+ const float32x4_t voutput_min_less_zero_point = vld1q_dup_f32(¶ms->fp32_neon.output_min_less_zero_point);
+ vfpacc0x0123 = vmaxq_f32(vfpacc0x0123, voutput_min_less_zero_point);
+ vfpacc0x4567 = vmaxq_f32(vfpacc0x4567, voutput_min_less_zero_point);
+
+ const float32x4_t voutput_max_less_zero_point = vld1q_dup_f32(¶ms->fp32_neon.output_max_less_zero_point);
+ vfpacc0x0123 = vminq_f32(vfpacc0x0123, voutput_max_less_zero_point);
+ vfpacc0x4567 = vminq_f32(vfpacc0x4567, voutput_max_less_zero_point);
+
+ const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->fp32_neon.magic_bias);
+ vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias));
+ vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias));
+
+ const int32x4_t vmagic_bias_less_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_zero_point);
+ vacc0x0123 = vsubq_s32(vacc0x0123, vmagic_bias_less_zero_point);
+ vacc0x4567 = vsubq_s32(vacc0x4567, vmagic_bias_less_zero_point);
+
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vmovn_high_s32(vmovn_s32(vacc0x0123), vacc0x4567);
+
+ int8x8_t vout0x01234567 = vmovn_s16(vacc0x01234567);
+#else
+ const int16x8_t vacc0x01234567 = vcombine_s16(vmovn_s32(vacc0x0123), vmovn_s32(vacc0x4567));
+
+ int8x8_t vout0x01234567 = vmovn_s16(vacc0x01234567);
+#endif
+
+ if (nc >= 8) {
+ vst1_s8(c0 + 0, vout0x01234567);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+ vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+ vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1_lane_s8(c0, vout0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-fp32-neonv8-mlal-padal.c b/src/qs8-igemm/gen/1x8c8-minmax-fp32-neonv8-mlal-padal.c
new file mode 100644
index 0000000..99ed878
--- /dev/null
+++ b/src/qs8-igemm/gen/1x8c8-minmax-fp32-neonv8-mlal-padal.c
@@ -0,0 +1,225 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8 * sizeof(int8_t));
+ int8_t* c0 = c;
+
+ do {
+ int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ a += 1;
+
+ size_t k = kc;
+ // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+ while (k >= 16 * sizeof(int8_t)) {
+ const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+
+ const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+
+ const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+ vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+ vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+ const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+ vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+ vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+ const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+ vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+ vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+ const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+ vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+ vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+ const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+ vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+ vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+ const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+ vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+ vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+ const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+ vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+ vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+ const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+ vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+ vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+
+ k -= 16 * sizeof(int8_t);
+ }
+
+ // Handle 8 bytes at a time using MUL.
+ if (k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+ const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+ vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+ const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+ vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+ const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+ vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+ const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+ vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+ const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+ vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+ const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+ vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+ const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+ vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+ const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+ vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+
+ k -= 8 * sizeof(int8_t);
+ }
+
+ p -= 1 * sizeof(void*);
+ } while (p != 0);
+
+#if XNN_ARCH_ARM64
+ const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+ const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+ const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+ const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+ int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+ int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+#else
+ const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+ const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+ const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+ const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+ const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+ const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+ int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+ const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+ const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+ const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+ const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+ const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+ const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+ int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+#endif
+
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+
+ const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neonv8.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+
+ vacc0x0123 = vcvtnq_s32_f32(vfpacc0x0123);
+ vacc0x4567 = vcvtnq_s32_f32(vfpacc0x4567);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->fp32_neonv8.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+
+ int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+
+ int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#endif
+ const int8x8_t voutput_min = vld1_dup_s8(¶ms->fp32_neonv8.output_min);
+ const int8x8_t voutput_max = vld1_dup_s8(¶ms->fp32_neonv8.output_max);
+
+ vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+
+ vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+ if (nc >= 8) {
+ vst1_s8(c0 + 0, vout0x01234567);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+ vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+ vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1_lane_s8(c0, vout0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
index 8f8fc55..6a6d39a 100644
--- a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -186,6 +186,7 @@
const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->gemmlowp_neon.output_zero_point);
#if XNN_ARCH_ARM64
const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+
int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#else
const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
index 583a28f..eb65582 100644
--- a/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -137,6 +137,7 @@
const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->gemmlowp_neon.output_zero_point);
#if XNN_ARCH_ARM64
const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+
int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
#else
const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
diff --git a/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c
index 93342e5..537e774 100644
--- a/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -446,6 +446,7 @@
const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+
int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
#else
diff --git a/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c
index 79e88dd..5ecb289 100644
--- a/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/2x16c8-minmax-gemmlowp-neon-mull-padal.c
@@ -307,6 +307,7 @@
const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+
int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
#else
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-fp32-neon-mlal-padal.c b/src/qs8-igemm/gen/2x8c8-minmax-fp32-neon-mlal-padal.c
new file mode 100644
index 0000000..26fbc33
--- /dev/null
+++ b/src/qs8-igemm/gen/2x8c8-minmax-fp32-neon-mlal-padal.c
@@ -0,0 +1,329 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 2);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8 * sizeof(int8_t));
+ int8_t* c0 = c;
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 2) {
+ c1 = c0;
+ }
+
+ do {
+ int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc1x0 = vacc0x0;
+ int32x4_t vacc1x1 = vacc0x1;
+ int32x4_t vacc1x2 = vacc0x2;
+ int32x4_t vacc1x3 = vacc0x3;
+ int32x4_t vacc1x4 = vacc0x4;
+ int32x4_t vacc1x5 = vacc0x5;
+ int32x4_t vacc1x6 = vacc0x6;
+ int32x4_t vacc1x7 = vacc0x7;
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const int8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ a += 2;
+
+ size_t k = kc;
+ // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+ while (k >= 16 * sizeof(int8_t)) {
+ const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+
+ const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+
+ const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+ int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);
+ vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+ vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
+ vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+ vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+ const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+ int16x8_t vprod1x1 = vmull_s8(vb1x0, va1x0);
+ vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+ vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
+ vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+ vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+ const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+ int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);
+ vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+ vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
+ vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+ vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+ const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+ int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);
+ vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+ vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
+ vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+ vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+ const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+ int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
+ vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+ vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
+ vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+ vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+ const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+ int16x8_t vprod1x5 = vmull_s8(vb5x0, va1x0);
+ vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+ vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
+ vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+ vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+ const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+ int16x8_t vprod1x6 = vmull_s8(vb6x0, va1x0);
+ vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+ vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
+ vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+ vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+ const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+ int16x8_t vprod1x7 = vmull_s8(vb7x0, va1x0);
+ vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+ vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
+ vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+ vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+
+ k -= 16 * sizeof(int8_t);
+ }
+
+ // Handle 8 bytes at a time using MUL.
+ if (k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+ const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+ const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+ vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+ vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+ const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+ const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+ vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+ vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+ const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+ const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+ vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+ vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+ const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+ const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+ vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+ vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+ const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+ const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+ vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+ vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+ const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+ const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+ vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+ vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+ const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+ const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+ vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+ vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+ const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+ const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+ vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+ vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+
+ k -= 8 * sizeof(int8_t);
+ }
+
+ p -= 2 * sizeof(void*);
+ } while (p != 0);
+
+#if XNN_ARCH_ARM64
+ const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+ const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+ const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+ const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+ const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+ const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+ const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+ const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+ int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+ int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+ int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+ int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+#else
+ const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+ const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+ const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+ const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+ const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+ const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+ int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+ const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+ const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+ const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+ const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+ const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+ const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+ int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+ const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+ const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+ const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+ const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+ const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+ const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+ int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+ const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+ const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+ const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+ const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+ const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+ const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+ int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+#endif
+
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+ float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123);
+ float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567);
+
+ const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neon.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+ vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale);
+ vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale);
+
+ const float32x4_t voutput_min_less_zero_point = vld1q_dup_f32(¶ms->fp32_neon.output_min_less_zero_point);
+ vfpacc0x0123 = vmaxq_f32(vfpacc0x0123, voutput_min_less_zero_point);
+ vfpacc0x4567 = vmaxq_f32(vfpacc0x4567, voutput_min_less_zero_point);
+ vfpacc1x0123 = vmaxq_f32(vfpacc1x0123, voutput_min_less_zero_point);
+ vfpacc1x4567 = vmaxq_f32(vfpacc1x4567, voutput_min_less_zero_point);
+
+ const float32x4_t voutput_max_less_zero_point = vld1q_dup_f32(¶ms->fp32_neon.output_max_less_zero_point);
+ vfpacc0x0123 = vminq_f32(vfpacc0x0123, voutput_max_less_zero_point);
+ vfpacc0x4567 = vminq_f32(vfpacc0x4567, voutput_max_less_zero_point);
+ vfpacc1x0123 = vminq_f32(vfpacc1x0123, voutput_max_less_zero_point);
+ vfpacc1x4567 = vminq_f32(vfpacc1x4567, voutput_max_less_zero_point);
+
+ const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->fp32_neon.magic_bias);
+ vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias));
+ vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias));
+ vacc1x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x0123, vmagic_bias));
+ vacc1x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x4567, vmagic_bias));
+
+ const int32x4_t vmagic_bias_less_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_zero_point);
+ vacc0x0123 = vsubq_s32(vacc0x0123, vmagic_bias_less_zero_point);
+ vacc0x4567 = vsubq_s32(vacc0x4567, vmagic_bias_less_zero_point);
+ vacc1x0123 = vsubq_s32(vacc1x0123, vmagic_bias_less_zero_point);
+ vacc1x4567 = vsubq_s32(vacc1x4567, vmagic_bias_less_zero_point);
+
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vmovn_high_s32(vmovn_s32(vacc0x0123), vacc0x4567);
+ const int16x8_t vacc1x01234567 = vmovn_high_s32(vmovn_s32(vacc1x0123), vacc1x4567);
+
+ int8x16_t vout0x01234567_1x01234567 = vmovn_high_s16(vmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+ const int16x8_t vacc0x01234567 = vcombine_s16(vmovn_s32(vacc0x0123), vmovn_s32(vacc0x4567));
+ const int16x8_t vacc1x01234567 = vcombine_s16(vmovn_s32(vacc1x0123), vmovn_s32(vacc1x4567));
+
+ int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vmovn_s16(vacc0x01234567), vmovn_s16(vacc1x01234567));
+#endif
+
+ if (nc >= 8) {
+ vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+ vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-fp32-neonv8-mlal-padal.c b/src/qs8-igemm/gen/2x8c8-minmax-fp32-neonv8-mlal-padal.c
new file mode 100644
index 0000000..b6ad403
--- /dev/null
+++ b/src/qs8-igemm/gen/2x8c8-minmax-fp32-neonv8-mlal-padal.c
@@ -0,0 +1,317 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c8-neon-mull-padal.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 2);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8 * sizeof(int8_t));
+ int8_t* c0 = c;
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 2) {
+ c1 = c0;
+ }
+
+ do {
+ int32x4_t vacc0x0 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x1 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x2 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x3 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x4 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x5 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x6 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc0x7 = vld1q_lane_s32(w, vmovq_n_s32(0), 0); w = (const void*) ((uintptr_t) w + sizeof(int32_t));
+ int32x4_t vacc1x0 = vacc0x0;
+ int32x4_t vacc1x1 = vacc0x1;
+ int32x4_t vacc1x2 = vacc0x2;
+ int32x4_t vacc1x3 = vacc0x3;
+ int32x4_t vacc1x4 = vacc0x4;
+ int32x4_t vacc1x5 = vacc0x5;
+ int32x4_t vacc1x6 = vacc0x6;
+ int32x4_t vacc1x7 = vacc0x7;
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const int8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ a += 2;
+
+ size_t k = kc;
+ // 2x partial unrolled loop to load 16 bytes at a time using MLA.
+ while (k >= 16 * sizeof(int8_t)) {
+ const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+
+ const int8x8_t vb0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb4x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb5x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb6x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ const int8x8_t vb7x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+
+ const int8x8_t vb0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x0 = vmull_s8(vb0x0, va0x0);
+ int16x8_t vprod1x0 = vmull_s8(vb0x0, va1x0);
+ vprod0x0 = vmlal_s8(vprod0x0, vb0x1, va0x1);
+ vprod1x0 = vmlal_s8(vprod1x0, vb0x1, va1x1);
+ vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+ vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+ const int8x8_t vb1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x1 = vmull_s8(vb1x0, va0x0);
+ int16x8_t vprod1x1 = vmull_s8(vb1x0, va1x0);
+ vprod0x1 = vmlal_s8(vprod0x1, vb1x1, va0x1);
+ vprod1x1 = vmlal_s8(vprod1x1, vb1x1, va1x1);
+ vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+ vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+ const int8x8_t vb2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x2 = vmull_s8(vb2x0, va0x0);
+ int16x8_t vprod1x2 = vmull_s8(vb2x0, va1x0);
+ vprod0x2 = vmlal_s8(vprod0x2, vb2x1, va0x1);
+ vprod1x2 = vmlal_s8(vprod1x2, vb2x1, va1x1);
+ vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+ vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+ const int8x8_t vb3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x3 = vmull_s8(vb3x0, va0x0);
+ int16x8_t vprod1x3 = vmull_s8(vb3x0, va1x0);
+ vprod0x3 = vmlal_s8(vprod0x3, vb3x1, va0x1);
+ vprod1x3 = vmlal_s8(vprod1x3, vb3x1, va1x1);
+ vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+ vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+ const int8x8_t vb4x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x4 = vmull_s8(vb4x0, va0x0);
+ int16x8_t vprod1x4 = vmull_s8(vb4x0, va1x0);
+ vprod0x4 = vmlal_s8(vprod0x4, vb4x1, va0x1);
+ vprod1x4 = vmlal_s8(vprod1x4, vb4x1, va1x1);
+ vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+ vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+ const int8x8_t vb5x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x5 = vmull_s8(vb5x0, va0x0);
+ int16x8_t vprod1x5 = vmull_s8(vb5x0, va1x0);
+ vprod0x5 = vmlal_s8(vprod0x5, vb5x1, va0x1);
+ vprod1x5 = vmlal_s8(vprod1x5, vb5x1, va1x1);
+ vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+ vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+ const int8x8_t vb6x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x6 = vmull_s8(vb6x0, va0x0);
+ int16x8_t vprod1x6 = vmull_s8(vb6x0, va1x0);
+ vprod0x6 = vmlal_s8(vprod0x6, vb6x1, va0x1);
+ vprod1x6 = vmlal_s8(vprod1x6, vb6x1, va1x1);
+ vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+ vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+ const int8x8_t vb7x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof( int8_t));
+ int16x8_t vprod0x7 = vmull_s8(vb7x0, va0x0);
+ int16x8_t vprod1x7 = vmull_s8(vb7x0, va1x0);
+ vprod0x7 = vmlal_s8(vprod0x7, vb7x1, va0x1);
+ vprod1x7 = vmlal_s8(vprod1x7, vb7x1, va1x1);
+ vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+ vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+
+ k -= 16 * sizeof(int8_t);
+ }
+
+ // Handle 8 bytes at a time using MUL.
+ if (k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+ const int8x8_t vb0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x0 = vmull_s8(vb0, va0);
+ const int16x8_t vprod1x0 = vmull_s8(vb0, va1);
+ vacc0x0 = vpadalq_s16(vacc0x0, vprod0x0);
+ vacc1x0 = vpadalq_s16(vacc1x0, vprod1x0);
+ const int8x8_t vb1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x1 = vmull_s8(vb1, va0);
+ const int16x8_t vprod1x1 = vmull_s8(vb1, va1);
+ vacc0x1 = vpadalq_s16(vacc0x1, vprod0x1);
+ vacc1x1 = vpadalq_s16(vacc1x1, vprod1x1);
+ const int8x8_t vb2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x2 = vmull_s8(vb2, va0);
+ const int16x8_t vprod1x2 = vmull_s8(vb2, va1);
+ vacc0x2 = vpadalq_s16(vacc0x2, vprod0x2);
+ vacc1x2 = vpadalq_s16(vacc1x2, vprod1x2);
+ const int8x8_t vb3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x3 = vmull_s8(vb3, va0);
+ const int16x8_t vprod1x3 = vmull_s8(vb3, va1);
+ vacc0x3 = vpadalq_s16(vacc0x3, vprod0x3);
+ vacc1x3 = vpadalq_s16(vacc1x3, vprod1x3);
+ const int8x8_t vb4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x4 = vmull_s8(vb4, va0);
+ const int16x8_t vprod1x4 = vmull_s8(vb4, va1);
+ vacc0x4 = vpadalq_s16(vacc0x4, vprod0x4);
+ vacc1x4 = vpadalq_s16(vacc1x4, vprod1x4);
+ const int8x8_t vb5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x5 = vmull_s8(vb5, va0);
+ const int16x8_t vprod1x5 = vmull_s8(vb5, va1);
+ vacc0x5 = vpadalq_s16(vacc0x5, vprod0x5);
+ vacc1x5 = vpadalq_s16(vacc1x5, vprod1x5);
+ const int8x8_t vb6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x6 = vmull_s8(vb6, va0);
+ const int16x8_t vprod1x6 = vmull_s8(vb6, va1);
+ vacc0x6 = vpadalq_s16(vacc0x6, vprod0x6);
+ vacc1x6 = vpadalq_s16(vacc1x6, vprod1x6);
+ const int8x8_t vb7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vprod0x7 = vmull_s8(vb7, va0);
+ const int16x8_t vprod1x7 = vmull_s8(vb7, va1);
+ vacc0x7 = vpadalq_s16(vacc0x7, vprod0x7);
+ vacc1x7 = vpadalq_s16(vacc1x7, vprod1x7);
+
+ k -= 8 * sizeof(int8_t);
+ }
+
+ p -= 2 * sizeof(void*);
+ } while (p != 0);
+
+#if XNN_ARCH_ARM64
+ const int32x4_t vsum0x01 = vpaddq_s32(vacc0x0, vacc0x1);
+ const int32x4_t vsum0x23 = vpaddq_s32(vacc0x2, vacc0x3);
+ const int32x4_t vsum0x45 = vpaddq_s32(vacc0x4, vacc0x5);
+ const int32x4_t vsum0x67 = vpaddq_s32(vacc0x6, vacc0x7);
+ const int32x4_t vsum1x01 = vpaddq_s32(vacc1x0, vacc1x1);
+ const int32x4_t vsum1x23 = vpaddq_s32(vacc1x2, vacc1x3);
+ const int32x4_t vsum1x45 = vpaddq_s32(vacc1x4, vacc1x5);
+ const int32x4_t vsum1x67 = vpaddq_s32(vacc1x6, vacc1x7);
+ int32x4_t vacc0x0123 = vpaddq_s32(vsum0x01, vsum0x23);
+ int32x4_t vacc0x4567 = vpaddq_s32(vsum0x45, vsum0x67);
+ int32x4_t vacc1x0123 = vpaddq_s32(vsum1x01, vsum1x23);
+ int32x4_t vacc1x4567 = vpaddq_s32(vsum1x45, vsum1x67);
+#else
+ const int32x2_t vpsum0x0 = vadd_s32(vget_low_s32(vacc0x0), vget_high_s32(vacc0x0));
+ const int32x2_t vpsum0x1 = vadd_s32(vget_low_s32(vacc0x1), vget_high_s32(vacc0x1));
+ const int32x2_t vpsum0x2 = vadd_s32(vget_low_s32(vacc0x2), vget_high_s32(vacc0x2));
+ const int32x2_t vpsum0x3 = vadd_s32(vget_low_s32(vacc0x3), vget_high_s32(vacc0x3));
+ const int32x2_t vsum0x01 = vpadd_s32(vpsum0x0, vpsum0x1);
+ const int32x2_t vsum0x23 = vpadd_s32(vpsum0x2, vpsum0x3);
+ int32x4_t vacc0x0123 = vcombine_s32(vsum0x01, vsum0x23 );
+ const int32x2_t vpsum0x4 = vadd_s32(vget_low_s32(vacc0x4), vget_high_s32(vacc0x4));
+ const int32x2_t vpsum0x5 = vadd_s32(vget_low_s32(vacc0x5), vget_high_s32(vacc0x5));
+ const int32x2_t vpsum0x6 = vadd_s32(vget_low_s32(vacc0x6), vget_high_s32(vacc0x6));
+ const int32x2_t vpsum0x7 = vadd_s32(vget_low_s32(vacc0x7), vget_high_s32(vacc0x7));
+ const int32x2_t vsum0x45 = vpadd_s32(vpsum0x4, vpsum0x5);
+ const int32x2_t vsum0x67 = vpadd_s32(vpsum0x6, vpsum0x7);
+ int32x4_t vacc0x4567 = vcombine_s32(vsum0x45, vsum0x67 );
+ const int32x2_t vpsum1x0 = vadd_s32(vget_low_s32(vacc1x0), vget_high_s32(vacc1x0));
+ const int32x2_t vpsum1x1 = vadd_s32(vget_low_s32(vacc1x1), vget_high_s32(vacc1x1));
+ const int32x2_t vpsum1x2 = vadd_s32(vget_low_s32(vacc1x2), vget_high_s32(vacc1x2));
+ const int32x2_t vpsum1x3 = vadd_s32(vget_low_s32(vacc1x3), vget_high_s32(vacc1x3));
+ const int32x2_t vsum1x01 = vpadd_s32(vpsum1x0, vpsum1x1);
+ const int32x2_t vsum1x23 = vpadd_s32(vpsum1x2, vpsum1x3);
+ int32x4_t vacc1x0123 = vcombine_s32(vsum1x01, vsum1x23 );
+ const int32x2_t vpsum1x4 = vadd_s32(vget_low_s32(vacc1x4), vget_high_s32(vacc1x4));
+ const int32x2_t vpsum1x5 = vadd_s32(vget_low_s32(vacc1x5), vget_high_s32(vacc1x5));
+ const int32x2_t vpsum1x6 = vadd_s32(vget_low_s32(vacc1x6), vget_high_s32(vacc1x6));
+ const int32x2_t vpsum1x7 = vadd_s32(vget_low_s32(vacc1x7), vget_high_s32(vacc1x7));
+ const int32x2_t vsum1x45 = vpadd_s32(vpsum1x4, vpsum1x5);
+ const int32x2_t vsum1x67 = vpadd_s32(vpsum1x6, vpsum1x7);
+ int32x4_t vacc1x4567 = vcombine_s32(vsum1x45, vsum1x67 );
+#endif
+
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+ float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123);
+ float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567);
+
+ const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neonv8.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+ vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale);
+ vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale);
+
+ vacc0x0123 = vcvtnq_s32_f32(vfpacc0x0123);
+ vacc0x4567 = vcvtnq_s32_f32(vfpacc0x4567);
+ vacc1x0123 = vcvtnq_s32_f32(vfpacc1x0123);
+ vacc1x4567 = vcvtnq_s32_f32(vfpacc1x4567);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->fp32_neonv8.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+
+ int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+ int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+ const int8x16_t voutput_min = vld1q_dup_s8(¶ms->fp32_neonv8.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(¶ms->fp32_neonv8.output_max);
+
+ vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+ vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+ if (nc >= 8) {
+ vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+ vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
index 0bfd2ca..dde604f 100644
--- a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -272,6 +272,7 @@
#if XNN_ARCH_ARM64
const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+
int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
#else
const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
index ac92e6a..ae3ec2e 100644
--- a/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -197,6 +197,7 @@
#if XNN_ARCH_ARM64
const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+
int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
#else
const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
diff --git a/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c
index 5871b15..2b55042 100644
--- a/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -607,6 +607,7 @@
const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+
int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
diff --git a/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c
index ea366d4..8dfc8b2 100644
--- a/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/3x16c8-minmax-gemmlowp-neon-mull-padal.c
@@ -418,6 +418,7 @@
const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+
int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
diff --git a/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c
index 7e12cb2..f0a06a0 100644
--- a/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -358,6 +358,7 @@
const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+
int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
#else
diff --git a/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c
index b87a0dc..79d68d2 100644
--- a/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/3x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -257,6 +257,7 @@
const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+
int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
#else
diff --git a/src/qs8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane.c b/src/qs8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane.c
new file mode 100644
index 0000000..b1b79d4
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane.c
@@ -0,0 +1,654 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/neon-mlal-lane.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc1x0123 = vacc0x0123;
+ int32x4_t vacc1x4567 = vacc0x4567;
+ int32x4_t vacc1x89AB = vacc0x89AB;
+ int32x4_t vacc1xCDEF = vacc0xCDEF;
+ int32x4_t vacc2x0123 = vacc0x0123;
+ int32x4_t vacc2x4567 = vacc0x4567;
+ int32x4_t vacc2x89AB = vacc0x89AB;
+ int32x4_t vacc2xCDEF = vacc0xCDEF;
+ int32x4_t vacc3x0123 = vacc0x0123;
+ int32x4_t vacc3x4567 = vacc0x4567;
+ int32x4_t vacc3x89AB = vacc0x89AB;
+ int32x4_t vacc3xCDEF = vacc0xCDEF;
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const int8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ const int8_t* restrict a2 = a[2];
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+ }
+ const int8_t* restrict a3 = a[3];
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+ while (k >= 8 * sizeof(int8_t)) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+ const int16x8_t vxa0 = vmovl_s8(va0);
+ const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+ const int16x8_t vxa1 = vmovl_s8(va1);
+ const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+ const int16x8_t vxa2 = vmovl_s8(va2);
+ const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+ const int16x8_t vxa3 = vmovl_s8(va3);
+
+ const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+ const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+ const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+ const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+ const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+ const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+ const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+ const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+
+
+ const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+ const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+ const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+ const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+ const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+ const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+ const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc7 = vmovl_s8(vb89ABCDEFc7);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3);
+
+ k -= 8 * sizeof(int8_t);
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+ const int16x8_t vxa0 = vmovl_s8(va0);
+ const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+ const int16x8_t vxa1 = vmovl_s8(va1);
+ const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+ const int16x8_t vxa2 = vmovl_s8(va2);
+ const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+ const int16x8_t vxa3 = vmovl_s8(va3);
+
+ const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+ const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+
+ if (k >= 2 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+ const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+
+ if (k > 2 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+ const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+
+ if (k >= 4 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+ const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+
+ if (k > 4 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+ const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+
+ if (k >= 6 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+ const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+
+ if (k > 6 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+ const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+ float32x4_t vfpacc0x89AB = vcvtq_f32_s32(vacc0x89AB);
+ float32x4_t vfpacc0xCDEF = vcvtq_f32_s32(vacc0xCDEF);
+ float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123);
+ float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567);
+ float32x4_t vfpacc1x89AB = vcvtq_f32_s32(vacc1x89AB);
+ float32x4_t vfpacc1xCDEF = vcvtq_f32_s32(vacc1xCDEF);
+ float32x4_t vfpacc2x0123 = vcvtq_f32_s32(vacc2x0123);
+ float32x4_t vfpacc2x4567 = vcvtq_f32_s32(vacc2x4567);
+ float32x4_t vfpacc2x89AB = vcvtq_f32_s32(vacc2x89AB);
+ float32x4_t vfpacc2xCDEF = vcvtq_f32_s32(vacc2xCDEF);
+ float32x4_t vfpacc3x0123 = vcvtq_f32_s32(vacc3x0123);
+ float32x4_t vfpacc3x4567 = vcvtq_f32_s32(vacc3x4567);
+ float32x4_t vfpacc3x89AB = vcvtq_f32_s32(vacc3x89AB);
+ float32x4_t vfpacc3xCDEF = vcvtq_f32_s32(vacc3xCDEF);
+
+ const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neon.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+ vfpacc0x89AB = vmulq_f32(vfpacc0x89AB, vscale);
+ vfpacc0xCDEF = vmulq_f32(vfpacc0xCDEF, vscale);
+ vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale);
+ vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale);
+ vfpacc1x89AB = vmulq_f32(vfpacc1x89AB, vscale);
+ vfpacc1xCDEF = vmulq_f32(vfpacc1xCDEF, vscale);
+ vfpacc2x0123 = vmulq_f32(vfpacc2x0123, vscale);
+ vfpacc2x4567 = vmulq_f32(vfpacc2x4567, vscale);
+ vfpacc2x89AB = vmulq_f32(vfpacc2x89AB, vscale);
+ vfpacc2xCDEF = vmulq_f32(vfpacc2xCDEF, vscale);
+ vfpacc3x0123 = vmulq_f32(vfpacc3x0123, vscale);
+ vfpacc3x4567 = vmulq_f32(vfpacc3x4567, vscale);
+ vfpacc3x89AB = vmulq_f32(vfpacc3x89AB, vscale);
+ vfpacc3xCDEF = vmulq_f32(vfpacc3xCDEF, vscale);
+
+ const float32x4_t voutput_min_less_zero_point = vld1q_dup_f32(¶ms->fp32_neon.output_min_less_zero_point);
+ vfpacc0x0123 = vmaxq_f32(vfpacc0x0123, voutput_min_less_zero_point);
+ vfpacc0x4567 = vmaxq_f32(vfpacc0x4567, voutput_min_less_zero_point);
+ vfpacc0x89AB = vmaxq_f32(vfpacc0x89AB, voutput_min_less_zero_point);
+ vfpacc0xCDEF = vmaxq_f32(vfpacc0xCDEF, voutput_min_less_zero_point);
+ vfpacc1x0123 = vmaxq_f32(vfpacc1x0123, voutput_min_less_zero_point);
+ vfpacc1x4567 = vmaxq_f32(vfpacc1x4567, voutput_min_less_zero_point);
+ vfpacc1x89AB = vmaxq_f32(vfpacc1x89AB, voutput_min_less_zero_point);
+ vfpacc1xCDEF = vmaxq_f32(vfpacc1xCDEF, voutput_min_less_zero_point);
+ vfpacc2x0123 = vmaxq_f32(vfpacc2x0123, voutput_min_less_zero_point);
+ vfpacc2x4567 = vmaxq_f32(vfpacc2x4567, voutput_min_less_zero_point);
+ vfpacc2x89AB = vmaxq_f32(vfpacc2x89AB, voutput_min_less_zero_point);
+ vfpacc2xCDEF = vmaxq_f32(vfpacc2xCDEF, voutput_min_less_zero_point);
+ vfpacc3x0123 = vmaxq_f32(vfpacc3x0123, voutput_min_less_zero_point);
+ vfpacc3x4567 = vmaxq_f32(vfpacc3x4567, voutput_min_less_zero_point);
+ vfpacc3x89AB = vmaxq_f32(vfpacc3x89AB, voutput_min_less_zero_point);
+ vfpacc3xCDEF = vmaxq_f32(vfpacc3xCDEF, voutput_min_less_zero_point);
+
+ const float32x4_t voutput_max_less_zero_point = vld1q_dup_f32(¶ms->fp32_neon.output_max_less_zero_point);
+ vfpacc0x0123 = vminq_f32(vfpacc0x0123, voutput_max_less_zero_point);
+ vfpacc0x4567 = vminq_f32(vfpacc0x4567, voutput_max_less_zero_point);
+ vfpacc0x89AB = vminq_f32(vfpacc0x89AB, voutput_max_less_zero_point);
+ vfpacc0xCDEF = vminq_f32(vfpacc0xCDEF, voutput_max_less_zero_point);
+ vfpacc1x0123 = vminq_f32(vfpacc1x0123, voutput_max_less_zero_point);
+ vfpacc1x4567 = vminq_f32(vfpacc1x4567, voutput_max_less_zero_point);
+ vfpacc1x89AB = vminq_f32(vfpacc1x89AB, voutput_max_less_zero_point);
+ vfpacc1xCDEF = vminq_f32(vfpacc1xCDEF, voutput_max_less_zero_point);
+ vfpacc2x0123 = vminq_f32(vfpacc2x0123, voutput_max_less_zero_point);
+ vfpacc2x4567 = vminq_f32(vfpacc2x4567, voutput_max_less_zero_point);
+ vfpacc2x89AB = vminq_f32(vfpacc2x89AB, voutput_max_less_zero_point);
+ vfpacc2xCDEF = vminq_f32(vfpacc2xCDEF, voutput_max_less_zero_point);
+ vfpacc3x0123 = vminq_f32(vfpacc3x0123, voutput_max_less_zero_point);
+ vfpacc3x4567 = vminq_f32(vfpacc3x4567, voutput_max_less_zero_point);
+ vfpacc3x89AB = vminq_f32(vfpacc3x89AB, voutput_max_less_zero_point);
+ vfpacc3xCDEF = vminq_f32(vfpacc3xCDEF, voutput_max_less_zero_point);
+
+ const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->fp32_neon.magic_bias);
+ vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias));
+ vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias));
+ vacc0x89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x89AB, vmagic_bias));
+ vacc0xCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpacc0xCDEF, vmagic_bias));
+ vacc1x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x0123, vmagic_bias));
+ vacc1x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x4567, vmagic_bias));
+ vacc1x89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x89AB, vmagic_bias));
+ vacc1xCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpacc1xCDEF, vmagic_bias));
+ vacc2x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc2x0123, vmagic_bias));
+ vacc2x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc2x4567, vmagic_bias));
+ vacc2x89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc2x89AB, vmagic_bias));
+ vacc2xCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpacc2xCDEF, vmagic_bias));
+ vacc3x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc3x0123, vmagic_bias));
+ vacc3x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc3x4567, vmagic_bias));
+ vacc3x89AB = vreinterpretq_s32_f32(vaddq_f32(vfpacc3x89AB, vmagic_bias));
+ vacc3xCDEF = vreinterpretq_s32_f32(vaddq_f32(vfpacc3xCDEF, vmagic_bias));
+
+ const int32x4_t vmagic_bias_less_zero_point = vld1q_dup_s32(¶ms->fp32_neon.magic_bias_less_zero_point);
+ vacc0x0123 = vsubq_s32(vacc0x0123, vmagic_bias_less_zero_point);
+ vacc0x4567 = vsubq_s32(vacc0x4567, vmagic_bias_less_zero_point);
+ vacc0x89AB = vsubq_s32(vacc0x89AB, vmagic_bias_less_zero_point);
+ vacc0xCDEF = vsubq_s32(vacc0xCDEF, vmagic_bias_less_zero_point);
+ vacc1x0123 = vsubq_s32(vacc1x0123, vmagic_bias_less_zero_point);
+ vacc1x4567 = vsubq_s32(vacc1x4567, vmagic_bias_less_zero_point);
+ vacc1x89AB = vsubq_s32(vacc1x89AB, vmagic_bias_less_zero_point);
+ vacc1xCDEF = vsubq_s32(vacc1xCDEF, vmagic_bias_less_zero_point);
+ vacc2x0123 = vsubq_s32(vacc2x0123, vmagic_bias_less_zero_point);
+ vacc2x4567 = vsubq_s32(vacc2x4567, vmagic_bias_less_zero_point);
+ vacc2x89AB = vsubq_s32(vacc2x89AB, vmagic_bias_less_zero_point);
+ vacc2xCDEF = vsubq_s32(vacc2xCDEF, vmagic_bias_less_zero_point);
+ vacc3x0123 = vsubq_s32(vacc3x0123, vmagic_bias_less_zero_point);
+ vacc3x4567 = vsubq_s32(vacc3x4567, vmagic_bias_less_zero_point);
+ vacc3x89AB = vsubq_s32(vacc3x89AB, vmagic_bias_less_zero_point);
+ vacc3xCDEF = vsubq_s32(vacc3xCDEF, vmagic_bias_less_zero_point);
+
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vmovn_high_s32(vmovn_s32(vacc0x0123), vacc0x4567);
+ const int16x8_t vacc0x89ABCDEF = vmovn_high_s32(vmovn_s32(vacc0x89AB), vacc0xCDEF);
+ const int16x8_t vacc1x01234567 = vmovn_high_s32(vmovn_s32(vacc1x0123), vacc1x4567);
+ const int16x8_t vacc1x89ABCDEF = vmovn_high_s32(vmovn_s32(vacc1x89AB), vacc1xCDEF);
+ const int16x8_t vacc2x01234567 = vmovn_high_s32(vmovn_s32(vacc2x0123), vacc2x4567);
+ const int16x8_t vacc2x89ABCDEF = vmovn_high_s32(vmovn_s32(vacc2x89AB), vacc2xCDEF);
+ const int16x8_t vacc3x01234567 = vmovn_high_s32(vmovn_s32(vacc3x0123), vacc3x4567);
+ const int16x8_t vacc3x89ABCDEF = vmovn_high_s32(vmovn_s32(vacc3x89AB), vacc3xCDEF);
+
+ int8x16_t vout0x0123456789ABCDEF = vmovn_high_s16(vmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+ int8x16_t vout1x0123456789ABCDEF = vmovn_high_s16(vmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+ int8x16_t vout2x0123456789ABCDEF = vmovn_high_s16(vmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+ int8x16_t vout3x0123456789ABCDEF = vmovn_high_s16(vmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vcombine_s16(vmovn_s32(vacc0x0123), vmovn_s32(vacc0x4567));
+ const int16x8_t vacc0x89ABCDEF = vcombine_s16(vmovn_s32(vacc0x89AB), vmovn_s32(vacc0xCDEF));
+ const int16x8_t vacc1x01234567 = vcombine_s16(vmovn_s32(vacc1x0123), vmovn_s32(vacc1x4567));
+ const int16x8_t vacc1x89ABCDEF = vcombine_s16(vmovn_s32(vacc1x89AB), vmovn_s32(vacc1xCDEF));
+ const int16x8_t vacc2x01234567 = vcombine_s16(vmovn_s32(vacc2x0123), vmovn_s32(vacc2x4567));
+ const int16x8_t vacc2x89ABCDEF = vcombine_s16(vmovn_s32(vacc2x89AB), vmovn_s32(vacc2xCDEF));
+ const int16x8_t vacc3x01234567 = vcombine_s16(vmovn_s32(vacc3x0123), vmovn_s32(vacc3x4567));
+ const int16x8_t vacc3x89ABCDEF = vcombine_s16(vmovn_s32(vacc3x89AB), vmovn_s32(vacc3xCDEF));
+
+ int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vmovn_s16(vacc0x01234567), vmovn_s16(vacc0x89ABCDEF));
+ int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vmovn_s16(vacc1x01234567), vmovn_s16(vacc1x89ABCDEF));
+ int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vmovn_s16(vacc2x01234567), vmovn_s16(vacc2x89ABCDEF));
+ int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vmovn_s16(vacc3x01234567), vmovn_s16(vacc3x89ABCDEF));
+#endif
+
+ if (nc >= 16) {
+ vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+ vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+ vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+ vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+ c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 16;
+ } else {
+ int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+ int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+ if (nc & 8) {
+ vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+ vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+ vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+ vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+ vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+ vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+ }
+ if (nc & 4) {
+ vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+ vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+ vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c b/src/qs8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c
new file mode 100644
index 0000000..0f4e94e
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16-minmax-fp32-neonv8-mlal-lane.c
@@ -0,0 +1,612 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/neon-mlal-lane.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc1x0123 = vacc0x0123;
+ int32x4_t vacc1x4567 = vacc0x4567;
+ int32x4_t vacc1x89AB = vacc0x89AB;
+ int32x4_t vacc1xCDEF = vacc0xCDEF;
+ int32x4_t vacc2x0123 = vacc0x0123;
+ int32x4_t vacc2x4567 = vacc0x4567;
+ int32x4_t vacc2x89AB = vacc0x89AB;
+ int32x4_t vacc2xCDEF = vacc0xCDEF;
+ int32x4_t vacc3x0123 = vacc0x0123;
+ int32x4_t vacc3x4567 = vacc0x4567;
+ int32x4_t vacc3x89AB = vacc0x89AB;
+ int32x4_t vacc3xCDEF = vacc0xCDEF;
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const int8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ const int8_t* restrict a2 = a[2];
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+ }
+ const int8_t* restrict a3 = a[3];
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+ while (k >= 8 * sizeof(int8_t)) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+ const int16x8_t vxa0 = vmovl_s8(va0);
+ const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+ const int16x8_t vxa1 = vmovl_s8(va1);
+ const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+ const int16x8_t vxa2 = vmovl_s8(va2);
+ const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+ const int16x8_t vxa3 = vmovl_s8(va3);
+
+ const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+ const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+ const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+ const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+ const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+ const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+ const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+ const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+
+
+ const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+ const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+ const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+ const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+ const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+ const int8x8_t vb01234567c7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c7 = vmovl_s8(vb01234567c7);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+ const int8x8_t vb89ABCDEFc7 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc7 = vmovl_s8(vb89ABCDEFc7);
+
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa0), 3);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa1), 3);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa2), 3);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc7), vget_high_s16(vxa3), 3);
+
+ k -= 8 * sizeof(int8_t);
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+ const int16x8_t vxa0 = vmovl_s8(va0);
+ const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+ const int16x8_t vxa1 = vmovl_s8(va1);
+ const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+ const int16x8_t vxa2 = vmovl_s8(va2);
+ const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+ const int16x8_t vxa3 = vmovl_s8(va3);
+
+ const int8x8_t vb01234567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c0 = vmovl_s8(vb01234567c0);
+ const int8x8_t vb89ABCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc0 = vmovl_s8(vb89ABCDEFc0);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc0), vget_low_s16(vxa3), 0);
+
+ if (k >= 2 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c1 = vmovl_s8(vb01234567c1);
+ const int8x8_t vb89ABCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc1 = vmovl_s8(vb89ABCDEFc1);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc1), vget_low_s16(vxa3), 1);
+
+ if (k > 2 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c2 = vmovl_s8(vb01234567c2);
+ const int8x8_t vb89ABCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc2 = vmovl_s8(vb89ABCDEFc2);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc2), vget_low_s16(vxa3), 2);
+
+ if (k >= 4 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c3 = vmovl_s8(vb01234567c3);
+ const int8x8_t vb89ABCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc3 = vmovl_s8(vb89ABCDEFc3);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa0), 3);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa1), 3);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa2), 3);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc3), vget_low_s16(vxa3), 3);
+
+ if (k > 4 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c4 = vmovl_s8(vb01234567c4);
+ const int8x8_t vb89ABCDEFc4 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc4 = vmovl_s8(vb89ABCDEFc4);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc4), vget_high_s16(vxa3), 0);
+
+ if (k >= 6 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c5 = vmovl_s8(vb01234567c5);
+ const int8x8_t vb89ABCDEFc5 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc5 = vmovl_s8(vb89ABCDEFc5);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc5), vget_high_s16(vxa3), 1);
+
+ if (k > 6 * sizeof(int8_t)) {
+ const int8x8_t vb01234567c6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb01234567c6 = vmovl_s8(vb01234567c6);
+ const int8x8_t vb89ABCDEFc6 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int16x8_t vxb89ABCDEFc6 = vmovl_s8(vb89ABCDEFc6);
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x89AB = vmlal_lane_s16(vacc0x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ vacc0xCDEF = vmlal_lane_s16(vacc0xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc1x89AB = vmlal_lane_s16(vacc1x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+ vacc1xCDEF = vmlal_lane_s16(vacc1xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc2x89AB = vmlal_lane_s16(vacc2x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+ vacc2xCDEF = vmlal_lane_s16(vacc2xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ vacc3x89AB = vmlal_lane_s16(vacc3x89AB, vget_low_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+ vacc3xCDEF = vmlal_lane_s16(vacc3xCDEF, vget_high_s16(vxb89ABCDEFc6), vget_high_s16(vxa3), 2);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+ float32x4_t vfpacc0x89AB = vcvtq_f32_s32(vacc0x89AB);
+ float32x4_t vfpacc0xCDEF = vcvtq_f32_s32(vacc0xCDEF);
+ float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123);
+ float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567);
+ float32x4_t vfpacc1x89AB = vcvtq_f32_s32(vacc1x89AB);
+ float32x4_t vfpacc1xCDEF = vcvtq_f32_s32(vacc1xCDEF);
+ float32x4_t vfpacc2x0123 = vcvtq_f32_s32(vacc2x0123);
+ float32x4_t vfpacc2x4567 = vcvtq_f32_s32(vacc2x4567);
+ float32x4_t vfpacc2x89AB = vcvtq_f32_s32(vacc2x89AB);
+ float32x4_t vfpacc2xCDEF = vcvtq_f32_s32(vacc2xCDEF);
+ float32x4_t vfpacc3x0123 = vcvtq_f32_s32(vacc3x0123);
+ float32x4_t vfpacc3x4567 = vcvtq_f32_s32(vacc3x4567);
+ float32x4_t vfpacc3x89AB = vcvtq_f32_s32(vacc3x89AB);
+ float32x4_t vfpacc3xCDEF = vcvtq_f32_s32(vacc3xCDEF);
+
+ const float32x4_t vscale = vld1q_dup_f32(¶ms->fp32_neonv8.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+ vfpacc0x89AB = vmulq_f32(vfpacc0x89AB, vscale);
+ vfpacc0xCDEF = vmulq_f32(vfpacc0xCDEF, vscale);
+ vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale);
+ vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale);
+ vfpacc1x89AB = vmulq_f32(vfpacc1x89AB, vscale);
+ vfpacc1xCDEF = vmulq_f32(vfpacc1xCDEF, vscale);
+ vfpacc2x0123 = vmulq_f32(vfpacc2x0123, vscale);
+ vfpacc2x4567 = vmulq_f32(vfpacc2x4567, vscale);
+ vfpacc2x89AB = vmulq_f32(vfpacc2x89AB, vscale);
+ vfpacc2xCDEF = vmulq_f32(vfpacc2xCDEF, vscale);
+ vfpacc3x0123 = vmulq_f32(vfpacc3x0123, vscale);
+ vfpacc3x4567 = vmulq_f32(vfpacc3x4567, vscale);
+ vfpacc3x89AB = vmulq_f32(vfpacc3x89AB, vscale);
+ vfpacc3xCDEF = vmulq_f32(vfpacc3xCDEF, vscale);
+
+ vacc0x0123 = vcvtnq_s32_f32(vfpacc0x0123);
+ vacc0x4567 = vcvtnq_s32_f32(vfpacc0x4567);
+ vacc0x89AB = vcvtnq_s32_f32(vfpacc0x89AB);
+ vacc0xCDEF = vcvtnq_s32_f32(vfpacc0xCDEF);
+ vacc1x0123 = vcvtnq_s32_f32(vfpacc1x0123);
+ vacc1x4567 = vcvtnq_s32_f32(vfpacc1x4567);
+ vacc1x89AB = vcvtnq_s32_f32(vfpacc1x89AB);
+ vacc1xCDEF = vcvtnq_s32_f32(vfpacc1xCDEF);
+ vacc2x0123 = vcvtnq_s32_f32(vfpacc2x0123);
+ vacc2x4567 = vcvtnq_s32_f32(vfpacc2x4567);
+ vacc2x89AB = vcvtnq_s32_f32(vfpacc2x89AB);
+ vacc2xCDEF = vcvtnq_s32_f32(vfpacc2xCDEF);
+ vacc3x0123 = vcvtnq_s32_f32(vfpacc3x0123);
+ vacc3x4567 = vcvtnq_s32_f32(vfpacc3x4567);
+ vacc3x89AB = vcvtnq_s32_f32(vfpacc3x89AB);
+ vacc3xCDEF = vcvtnq_s32_f32(vfpacc3xCDEF);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->fp32_neonv8.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+ const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+ const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+ const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+ int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+ int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+ int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+ const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+ const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+ const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+ int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+ int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+ int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+ const int8x16_t voutput_min = vld1q_dup_s8(¶ms->fp32_neonv8.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(¶ms->fp32_neonv8.output_max);
+
+ vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+ vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+ vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+ vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+ vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+ vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+ vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+ vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+ vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+ vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+ vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+ c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 16;
+ } else {
+ int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+ int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+ if (nc & 8) {
+ vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+ vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+ vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+ vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+ vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+ vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+ }
+ if (nc & 4) {
+ vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+ vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+ vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c
index 834df49..576879e 100644
--- a/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -768,6 +768,7 @@
const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
diff --git a/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c
index 2a87b24..79b1b20 100644
--- a/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/4x16c8-minmax-gemmlowp-neon-mull-padal.c
@@ -529,6 +529,7 @@
const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
diff --git a/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c b/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c
index faeabc3..552ee75 100644
--- a/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c
+++ b/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mlal-padal.c
@@ -444,6 +444,7 @@
const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+
int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
#else
diff --git a/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c b/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c
index 1b9a4ec..f0aa950 100644
--- a/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c
+++ b/src/qs8-igemm/gen/4x8c8-minmax-gemmlowp-neon-mull-padal.c
@@ -317,6 +317,7 @@
const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+
int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
#else
diff --git a/src/qs8-igemm/neon-mlal-lane.c.in b/src/qs8-igemm/neon-mlal-lane.c.in
index 5527bc2..6e9c968 100644
--- a/src/qs8-igemm/neon-mlal-lane.c.in
+++ b/src/qs8-igemm/neon-mlal-lane.c.in
@@ -6,6 +6,7 @@
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
$assert NR % 8 == 0
$assert 8 <= NR <= 16
+$assert REQUANTIZATION in ["GEMMLOWP", "FP32"]
#include <assert.h>
#include <arm_neon.h>
@@ -14,7 +15,9 @@
#include <xnnpack/igemm.h>
-void xnn_qs8_igemm_minmax_gemmlowp_ukernel_${MR}x${NR}__neon_mlal_lane${"_prfm" if PREFETCH else ""}(
+$PARAMS_STRUCT = REQUANTIZATION.lower() + "_" + ("neonv8" if ARMV8 else "neon")
+$ISA = "neonv8" if ARMV8 else "neon"
+void xnn_qs8_igemm_minmax_${REQUANTIZATION.lower()}_ukernel_${MR}x${NR}__${ISA}_mlal_lane${"_prfm" if PREFETCH else ""}(
size_t mr,
size_t nc,
size_t kc,
@@ -184,79 +187,144 @@
p -= ${MR} * sizeof(void*);
} while (p != 0);
- const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->gemmlowp_neon.multiplier);
- $for M in range(MR):
- $for N in range(0, NR, 4):
- vacc${M}x${ABC[N:N+4]} = vqrdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
+ $if REQUANTIZATION == "GEMMLOWP":
+ const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->${PARAMS_STRUCT}.multiplier);
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vacc${M}x${ABC[N:N+4]} = vqrdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
- const int32x4_t vright_shift = vld1q_dup_s32(¶ms->gemmlowp_neon.right_shift);
- const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
- $for M in range(MR):
- $for N in range(0, NR, 4):
- vacc${M}x${ABC[N:N+4]} = vsraq_n_s32(vacc${M}x${ABC[N:N+4]}, vbicq_s32(vacc${M}x${ABC[N:N+4]}, vzero_shift_mask), 31);
+ const int32x4_t vright_shift = vld1q_dup_s32(¶ms->${PARAMS_STRUCT}.right_shift);
+ const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vacc${M}x${ABC[N:N+4]} = vsraq_n_s32(vacc${M}x${ABC[N:N+4]}, vbicq_s32(vacc${M}x${ABC[N:N+4]}, vzero_shift_mask), 31);
- $for M in range(MR):
- $for N in range(0, NR, 4):
- vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_shift);
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_shift);
+ $elif REQUANTIZATION == "FP32":
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ float32x4_t vfpacc${M}x${ABC[N:N+4]} = vcvtq_f32_s32(vacc${M}x${ABC[N:N+4]});
- const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->gemmlowp_neon.output_zero_point);
+ const float32x4_t vscale = vld1q_dup_f32(¶ms->${PARAMS_STRUCT}.scale);
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vfpacc${M}x${ABC[N:N+4]} = vmulq_f32(vfpacc${M}x${ABC[N:N+4]}, vscale);
+
+ $if ARMV8:
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vacc${M}x${ABC[N:N+4]} = vcvtnq_s32_f32(vfpacc${M}x${ABC[N:N+4]});
+ $else:
+ const float32x4_t voutput_min_less_zero_point = vld1q_dup_f32(¶ms->${PARAMS_STRUCT}.output_min_less_zero_point);
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vfpacc${M}x${ABC[N:N+4]} = vmaxq_f32(vfpacc${M}x${ABC[N:N+4]}, voutput_min_less_zero_point);
+
+ const float32x4_t voutput_max_less_zero_point = vld1q_dup_f32(¶ms->${PARAMS_STRUCT}.output_max_less_zero_point);
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vfpacc${M}x${ABC[N:N+4]} = vminq_f32(vfpacc${M}x${ABC[N:N+4]}, voutput_max_less_zero_point);
+
+ const float32x4_t vmagic_bias = vld1q_dup_f32(¶ms->${PARAMS_STRUCT}.magic_bias);
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vacc${M}x${ABC[N:N+4]} = vreinterpretq_s32_f32(vaddq_f32(vfpacc${M}x${ABC[N:N+4]}, vmagic_bias));
+
+ const int32x4_t vmagic_bias_less_zero_point = vld1q_dup_s32(¶ms->${PARAMS_STRUCT}.magic_bias_less_zero_point);
+ $for M in range(MR):
+ $for N in range(0, NR, 4):
+ vacc${M}x${ABC[N:N+4]} = vsubq_s32(vacc${M}x${ABC[N:N+4]}, vmagic_bias_less_zero_point);
+
+ $if REQUANTIZATION != "FP32" or ARMV8:
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->${PARAMS_STRUCT}.output_zero_point);
#if XNN_ARCH_ARM64
- $for M in range(MR):
- $for N in range(0, NR, 8):
- const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
+ $if REQUANTIZATION == "FP32" and not ARMV8:
+ $for M in range(MR):
+ $for N in range(0, NR, 8):
+ const int16x8_t vacc${M}x${ABC[N:N+8]} = vmovn_high_s32(vmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]});
- $for M in range(MR):
- $for N in range(0, NR, 16):
- $if N + 8 < NR:
- int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
- $elif M % 2 == 1:
- int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
- $elif M + 1 == MR:
- int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
-#else
- $for M in range(MR):
- $for N in range(0, NR, 8):
- const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
-
- $for M in range(MR):
- $for N in range(0, NR, 16):
- $if N + 8 < NR:
- int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
- $elif M % 2 == 1:
- int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N:N+8]}));
- $elif M + 1 == MR:
- int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
-#endif
- $if NR == 8 and MR == 1:
- const int8x8_t voutput_min = vld1_dup_s8(¶ms->gemmlowp_neon.output_min);
- const int8x8_t voutput_max = vld1_dup_s8(¶ms->gemmlowp_neon.output_max);
+ $for M in range(MR):
+ $for N in range(0, NR, 16):
+ $if N + 8 < NR:
+ int8x16_t vout${M}x${ABC[N:N+16]} = vmovn_high_s16(vmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
+ $elif M % 2 == 1:
+ int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmovn_high_s16(vmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
+ $elif M + 1 == MR:
+ int8x8_t vout${M}x${ABC[N:N+8]} = vmovn_s16(vacc${M}x${ABC[N:N+8]});
$else:
- const int8x16_t voutput_min = vld1q_dup_s8(¶ms->gemmlowp_neon.output_min);
- const int8x16_t voutput_max = vld1q_dup_s8(¶ms->gemmlowp_neon.output_max);
+ $for M in range(MR):
+ $for N in range(0, NR, 8):
+ const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
- $for M in reversed(range(MR)):
- $for N in range(0, NR, 16):
- $if N + 8 < NR:
- vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min);
- $elif M % 2 == 1:
- vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
- $elif M + 1 == MR:
- $if NR == 8 and MR == 1:
- vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min);
- $else:
- vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min));
+ $for M in range(MR):
+ $for N in range(0, NR, 16):
+ $if N + 8 < NR:
+ int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
+ $elif M % 2 == 1:
+ int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
+ $elif M + 1 == MR:
+ int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#else
+ $if REQUANTIZATION == "FP32" and not ARMV8:
+ $for M in range(MR):
+ $for N in range(0, NR, 8):
+ const int16x8_t vacc${M}x${ABC[N:N+8]} = vcombine_s16(vmovn_s32(vacc${M}x${ABC[N:N+4]}), vmovn_s32(vacc${M}x${ABC[N+4:N+8]}));
- $for M in reversed(range(MR)):
- $for N in range(0, NR, 16):
- $if N + 8 < NR:
- vout${M}x${ABC[N:N+16]} = vminq_s8(vout${M}x${ABC[N:N+16]}, voutput_max);
- $elif M % 2 == 1:
- vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
- $elif M + 1 == MR:
- $if NR == 8 and MR == 1:
- vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, voutput_max);
- $else:
- vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_max));
+ $for M in range(MR):
+ $for N in range(0, NR, 16):
+ $if N + 8 < NR:
+ int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vmovn_s16(vacc${M}x${ABC[N:N+8]}), vmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
+ $elif M % 2 == 1:
+ int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vmovn_s16(vacc${M}x${ABC[N:N+8]}));
+ $elif M + 1 == MR:
+ int8x8_t vout${M}x${ABC[N:N+8]} = vmovn_s16(vacc${M}x${ABC[N:N+8]});
+ $else:
+ $for M in range(MR):
+ $for N in range(0, NR, 8):
+ const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
+
+ $for M in range(MR):
+ $for N in range(0, NR, 16):
+ $if N + 8 < NR:
+ int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
+ $elif M % 2 == 1:
+ int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N:N+8]}));
+ $elif M + 1 == MR:
+ int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#endif
+ $if REQUANTIZATION != "FP32" or ARMV8:
+ $if NR == 8 and MR == 1:
+ const int8x8_t voutput_min = vld1_dup_s8(¶ms->${PARAMS_STRUCT}.output_min);
+ const int8x8_t voutput_max = vld1_dup_s8(¶ms->${PARAMS_STRUCT}.output_max);
+ $else:
+ const int8x16_t voutput_min = vld1q_dup_s8(¶ms->${PARAMS_STRUCT}.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(¶ms->${PARAMS_STRUCT}.output_max);
+
+ $for M in reversed(range(MR)):
+ $for N in range(0, NR, 16):
+ $if N + 8 < NR:
+ vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min);
+ $elif M % 2 == 1:
+ vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
+ $elif M + 1 == MR:
+ $if NR == 8 and MR == 1:
+ vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min);
+ $else:
+ vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min));
+
+ $for M in reversed(range(MR)):
+ $for N in range(0, NR, 16):
+ $if N + 8 < NR:
+ vout${M}x${ABC[N:N+16]} = vminq_s8(vout${M}x${ABC[N:N+16]}, voutput_max);
+ $elif M % 2 == 1:
+ vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
+ $elif M + 1 == MR:
+ $if NR == 8 and MR == 1:
+ vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, voutput_max);
+ $else:
+ vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_max));
if (nc >= ${NR}) {
$for M in reversed(range(MR)):
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 5ae347a..84d077f 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -342,6 +342,12 @@
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x16__neon_mlal_lane)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_6x16__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane)
+
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x8__neon_mlal_lane_prfm)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x8__neon_mlal_lane_prfm)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8__neon_mlal_lane_prfm)
@@ -378,6 +384,12 @@
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x8c8__neon_mlal_padal)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_4x8c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal)
+
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_1x16c8__neon_mlal_padal)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_2x16c8__neon_mlal_padal)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_gemmlowp_ukernel_3x16c8__neon_mlal_padal)
diff --git a/test/qs8-igemm-minmax-fp32.cc b/test/qs8-igemm-minmax-fp32.cc
index e3d97d1..1ebaf9c 100644
--- a/test/qs8-igemm-minmax-fp32.cc
+++ b/test/qs8-igemm-minmax-fp32.cc
@@ -22,6 +22,3750 @@
#include "gemm-microkernel-tester.h"
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(43)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 1; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(43)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, n_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEON_MLAL_LANE, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, a_offset) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(43)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, zero) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t mz = 0; mz < 1; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(43)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, qmin) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, qmax) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X16__NEONV8_MLAL_LANE, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, a_offset) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, zero) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, qmin) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, qmax) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_4X16__NEONV8_MLAL_LANE, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 1; mz++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEON_MLAL_PADAL, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .cm_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_gt_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, n_div_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 2; mz++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEON_MLAL_PADAL, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .cm_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neon_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_div_16) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_gt_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, n_div_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, a_offset) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, zero) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t mz = 0; mz < 1; mz++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, qmin) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, qmax) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_1X8C8__NEONV8_MLAL_PADAL, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .cm_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_div_16) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_gt_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, n_div_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, a_offset) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, zero) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ for (uint32_t mz = 0; mz < 2; mz++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, qmin) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, qmax) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_FP32_2X8C8__NEONV8_MLAL_PADAL, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_V8;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .cm_stride(11)
+ .Test(xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal, xnn_init_qs8_conv_minmax_fp32_neonv8_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
TEST(QS8_IGEMM_MINMAX_FP32_1X8C4__NEONDOT, k_eq_8) {
TEST_REQUIRES_ARM_NEON_DOT;
diff --git a/test/qs8-igemm-minmax-fp32.yaml b/test/qs8-igemm-minmax-fp32.yaml
index 8f337f5..5392210 100644
--- a/test/qs8-igemm-minmax-fp32.yaml
+++ b/test/qs8-igemm-minmax-fp32.yaml
@@ -3,6 +3,30 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane
+ init: xnn_init_qs8_conv_minmax_fp32_neon_params
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane
+ init: xnn_init_qs8_conv_minmax_fp32_neon_params
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x16__neonv8_mlal_lane
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neon_mlal_padal
+ init: xnn_init_qs8_conv_minmax_fp32_neon_params
+ k-block: 16
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neon_mlal_padal
+ init: xnn_init_qs8_conv_minmax_fp32_neon_params
+ k-block: 16
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c8__neonv8_mlal_padal
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 16
+- name: xnn_qs8_igemm_minmax_fp32_ukernel_2x8c8__neonv8_mlal_padal
+ init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
+ k-block: 16
- name: xnn_qs8_igemm_minmax_fp32_ukernel_1x8c4__neondot
init: xnn_init_qs8_conv_minmax_fp32_neonv8_params
k-block: 8