Merge pull request #2036 from digantdesai:enable_fp32_arm_kernels
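
Adds FP32-requantization variants of the QU8 GEMM/IGEMM micro-kernels for ARM:
NEON mlal-lane 1x8 and 4x8 kernels (generated from the QS8 templates with
DATATYPE=QU8), NEONDOT 1x16c4, 2x16c4, and 4x16c4 kernels, and AArch64 NEONDOT
4x16c4 assembly kernels (Cortex-A55 and LD128 variants). The qu8-gemm
c4-neondot template assertion is relaxed to accept FP32 in addition to RNDNU,
and the new generated sources are registered in BUILD.bazel and CMakeLists.txt.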
PiperOrigin-RevId: 415221521
diff --git a/BUILD.bazel b/BUILD.bazel
index 0646691..7166f42 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -3134,15 +3134,19 @@
"src/qu8-dwconv/gen/up32x25-minmax-rndnu-neon-mul16.c",
"src/qu8-gavgpool/7p7x-minmax-neon-c8.c",
"src/qu8-gavgpool/7x-minmax-neon-c8.c",
+ "src/qu8-gemm/gen/1x8-minmax-fp32-neon-mlal-lane.c",
"src/qu8-gemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c",
"src/qu8-gemm/gen/1x16-minmax-fp32-neon-mlal-lane.c",
"src/qu8-gemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c",
+ "src/qu8-gemm/gen/4x8-minmax-fp32-neon-mlal-lane.c",
"src/qu8-gemm/gen/4x8-minmax-rndnu-neon-mlal-lane.c",
"src/qu8-gemm/gen/4x16-minmax-fp32-neon-mlal-lane.c",
"src/qu8-gemm/gen/4x16-minmax-rndnu-neon-mlal-lane.c",
+ "src/qu8-igemm/gen/1x8-minmax-fp32-neon-mlal-lane.c",
"src/qu8-igemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c",
"src/qu8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane.c",
"src/qu8-igemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c",
+ "src/qu8-igemm/gen/4x8-minmax-fp32-neon-mlal-lane.c",
"src/qu8-igemm/gen/4x8-minmax-rndnu-neon-mlal-lane.c",
"src/qu8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane.c",
"src/qu8-igemm/gen/4x16-minmax-rndnu-neon-mlal-lane.c",
@@ -3949,15 +3953,18 @@
"src/qs8-igemm/gen/8x8c4-minmax-rndnu-neondot.c",
"src/qs8-igemm/gen/8x16c4-minmax-rndnu-neondot.c",
"src/qu8-gemm/gen/1x8c4-minmax-rndnu-neondot.c",
+ "src/qu8-gemm/gen/1x16c4-minmax-fp32-neondot.c",
"src/qu8-gemm/gen/1x16c4-minmax-rndnu-neondot.c",
"src/qu8-gemm/gen/1x32c4-minmax-rndnu-neondot.c",
"src/qu8-gemm/gen/2x8c4-minmax-rndnu-neondot.c",
+ "src/qu8-gemm/gen/2x16c4-minmax-fp32-neondot.c",
"src/qu8-gemm/gen/2x16c4-minmax-rndnu-neondot.c",
"src/qu8-gemm/gen/2x32c4-minmax-rndnu-neondot.c",
"src/qu8-gemm/gen/3x8c4-minmax-rndnu-neondot.c",
"src/qu8-gemm/gen/3x16c4-minmax-rndnu-neondot.c",
"src/qu8-gemm/gen/3x32c4-minmax-rndnu-neondot.c",
"src/qu8-gemm/gen/4x8c4-minmax-rndnu-neondot.c",
+ "src/qu8-gemm/gen/4x16c4-minmax-fp32-neondot.c",
"src/qu8-gemm/gen/4x16c4-minmax-rndnu-neondot.c",
"src/qu8-gemm/gen/5x8c4-minmax-rndnu-neondot.c",
"src/qu8-gemm/gen/5x16c4-minmax-rndnu-neondot.c",
@@ -3966,14 +3973,17 @@
"src/qu8-gemm/gen/8x8c4-minmax-rndnu-neondot.c",
"src/qu8-gemm/gen/8x16c4-minmax-rndnu-neondot.c",
"src/qu8-igemm/gen/1x8c4-minmax-rndnu-neondot.c",
+ "src/qu8-igemm/gen/1x16c4-minmax-fp32-neondot.c",
"src/qu8-igemm/gen/1x16c4-minmax-rndnu-neondot.c",
"src/qu8-igemm/gen/1x32c4-minmax-rndnu-neondot.c",
"src/qu8-igemm/gen/2x8c4-minmax-rndnu-neondot.c",
+ "src/qu8-igemm/gen/2x16c4-minmax-fp32-neondot.c",
"src/qu8-igemm/gen/2x16c4-minmax-rndnu-neondot.c",
"src/qu8-igemm/gen/2x32c4-minmax-rndnu-neondot.c",
"src/qu8-igemm/gen/3x8c4-minmax-rndnu-neondot.c",
"src/qu8-igemm/gen/3x16c4-minmax-rndnu-neondot.c",
"src/qu8-igemm/gen/3x32c4-minmax-rndnu-neondot.c",
+ "src/qu8-igemm/gen/4x16c4-minmax-fp32-neondot.c",
"src/qu8-igemm/gen/4x8c4-minmax-rndnu-neondot.c",
"src/qu8-igemm/gen/4x16c4-minmax-rndnu-neondot.c",
"src/qu8-igemm/gen/5x8c4-minmax-rndnu-neondot.c",
@@ -6434,6 +6444,8 @@
"src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
"src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S",
"src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
+ "src/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S",
+ "src/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
"src/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
"src/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S",
"src/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
@@ -6444,6 +6456,8 @@
"src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S",
"src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S",
"src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S",
+ "src/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S",
+ "src/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S",
"src/qu8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S",
"src/qu8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S",
]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d704892..f7ccfae 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2100,15 +2100,19 @@
src/qu8-dwconv/gen/up32x25-minmax-rndnu-neon-mul16.c
src/qu8-gavgpool/7p7x-minmax-neon-c8.c
src/qu8-gavgpool/7x-minmax-neon-c8.c
+ src/qu8-gemm/gen/1x8-minmax-fp32-neon-mlal-lane.c
src/qu8-gemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c
src/qu8-gemm/gen/1x16-minmax-fp32-neon-mlal-lane.c
src/qu8-gemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c
+ src/qu8-gemm/gen/4x8-minmax-fp32-neon-mlal-lane.c
src/qu8-gemm/gen/4x8-minmax-rndnu-neon-mlal-lane.c
src/qu8-gemm/gen/4x16-minmax-fp32-neon-mlal-lane.c
src/qu8-gemm/gen/4x16-minmax-rndnu-neon-mlal-lane.c
+ src/qu8-igemm/gen/1x8-minmax-fp32-neon-mlal-lane.c
src/qu8-igemm/gen/1x8-minmax-rndnu-neon-mlal-lane.c
src/qu8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane.c
src/qu8-igemm/gen/1x16-minmax-rndnu-neon-mlal-lane.c
+ src/qu8-igemm/gen/4x8-minmax-fp32-neon-mlal-lane.c
src/qu8-igemm/gen/4x8-minmax-rndnu-neon-mlal-lane.c
src/qu8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane.c
src/qu8-igemm/gen/4x16-minmax-rndnu-neon-mlal-lane.c
@@ -2903,15 +2907,18 @@
src/qs8-igemm/gen/8x8c4-minmax-rndnu-neondot.c
src/qs8-igemm/gen/8x16c4-minmax-rndnu-neondot.c
src/qu8-gemm/gen/1x8c4-minmax-rndnu-neondot.c
+ src/qu8-gemm/gen/1x16c4-minmax-fp32-neondot.c
src/qu8-gemm/gen/1x16c4-minmax-rndnu-neondot.c
src/qu8-gemm/gen/1x32c4-minmax-rndnu-neondot.c
src/qu8-gemm/gen/2x8c4-minmax-rndnu-neondot.c
+ src/qu8-gemm/gen/2x16c4-minmax-fp32-neondot.c
src/qu8-gemm/gen/2x16c4-minmax-rndnu-neondot.c
src/qu8-gemm/gen/2x32c4-minmax-rndnu-neondot.c
src/qu8-gemm/gen/3x8c4-minmax-rndnu-neondot.c
src/qu8-gemm/gen/3x16c4-minmax-rndnu-neondot.c
src/qu8-gemm/gen/3x32c4-minmax-rndnu-neondot.c
src/qu8-gemm/gen/4x8c4-minmax-rndnu-neondot.c
+ src/qu8-gemm/gen/4x16c4-minmax-fp32-neondot.c
src/qu8-gemm/gen/4x16c4-minmax-rndnu-neondot.c
src/qu8-gemm/gen/5x8c4-minmax-rndnu-neondot.c
src/qu8-gemm/gen/5x16c4-minmax-rndnu-neondot.c
@@ -2920,15 +2927,18 @@
src/qu8-gemm/gen/8x8c4-minmax-rndnu-neondot.c
src/qu8-gemm/gen/8x16c4-minmax-rndnu-neondot.c
src/qu8-igemm/gen/1x8c4-minmax-rndnu-neondot.c
+ src/qu8-igemm/gen/1x16c4-minmax-fp32-neondot.c
src/qu8-igemm/gen/1x16c4-minmax-rndnu-neondot.c
src/qu8-igemm/gen/1x32c4-minmax-rndnu-neondot.c
src/qu8-igemm/gen/2x8c4-minmax-rndnu-neondot.c
+ src/qu8-igemm/gen/2x16c4-minmax-fp32-neondot.c
src/qu8-igemm/gen/2x16c4-minmax-rndnu-neondot.c
src/qu8-igemm/gen/2x32c4-minmax-rndnu-neondot.c
src/qu8-igemm/gen/3x8c4-minmax-rndnu-neondot.c
src/qu8-igemm/gen/3x16c4-minmax-rndnu-neondot.c
src/qu8-igemm/gen/3x32c4-minmax-rndnu-neondot.c
src/qu8-igemm/gen/4x8c4-minmax-rndnu-neondot.c
+ src/qu8-igemm/gen/4x16c4-minmax-fp32-neondot.c
src/qu8-igemm/gen/4x16c4-minmax-rndnu-neondot.c
src/qu8-igemm/gen/5x8c4-minmax-rndnu-neondot.c
src/qu8-igemm/gen/5x16c4-minmax-rndnu-neondot.c
@@ -5358,6 +5368,8 @@
src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S
src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S
src/qu8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
+ src/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
+ src/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
src/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S
src/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S
src/qu8-igemm/gen/4x8c4-minmax-rndnu-aarch64-neondot-cortex-a55.S
@@ -5368,6 +5380,8 @@
src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a53.S
src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-cortex-a75.S
src/qu8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-prfm-ld64.S
+ src/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
+ src/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
src/qu8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S
src/qu8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S)
diff --git a/scripts/generate-qs8-gemm.sh b/scripts/generate-qs8-gemm.sh
index 95bcc37..bb2d04a 100755
--- a/scripts/generate-qs8-gemm.sh
+++ b/scripts/generate-qs8-gemm.sh
@@ -219,6 +219,8 @@
tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=1 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gemm/gen/1x16-minmax-fp32-neon-mlal-lane.c &
tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=4 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-gemm/gen/4x16-minmax-fp32-neon-mlal-lane.c &
+tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=1 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gemm/gen/1x8-minmax-fp32-neon-mlal-lane.c &
+tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=4 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gemm/gen/4x8-minmax-fp32-neon-mlal-lane.c &
tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=1 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gemm/gen/1x16-minmax-fp32-neon-mlal-lane.c &
tools/xngen src/qs8-gemm/neon-mlal-lane.c.in -D MR=4 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-gemm/gen/4x16-minmax-fp32-neon-mlal-lane.c &
@@ -612,6 +614,10 @@
tools/xngen src/qu8-gemm/c4-neondot.c.in -D MR=2 -D NR=32 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -o src/qu8-gemm/gen/2x32c4-minmax-rndnu-neondot.c &
tools/xngen src/qu8-gemm/c4-neondot.c.in -D MR=3 -D NR=32 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -o src/qu8-gemm/gen/3x32c4-minmax-rndnu-neondot.c &
+tools/xngen src/qu8-gemm/c4-neondot.c.in -D MR=1 -D NR=16 -D REQUANTIZATION=FP32 -D CHANNELWISE=0 -o src/qu8-gemm/gen/1x16c4-minmax-fp32-neondot.c &
+tools/xngen src/qu8-gemm/c4-neondot.c.in -D MR=2 -D NR=16 -D REQUANTIZATION=FP32 -D CHANNELWISE=0 -o src/qu8-gemm/gen/2x16c4-minmax-fp32-neondot.c &
+tools/xngen src/qu8-gemm/c4-neondot.c.in -D MR=4 -D NR=16 -D REQUANTIZATION=FP32 -D CHANNELWISE=0 -o src/qu8-gemm/gen/4x16c4-minmax-fp32-neondot.c &
+
############################### AArch64 assembly ##############################
### Cortex-A53 lane micro-kernels
tools/xngen src/qs8-gemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-gemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S &
@@ -670,6 +676,9 @@
tools/xngen src/qu8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=RNDNU -o src/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S &
tools/xngen src/qu8-gemm/4x16c4-aarch64-neondot-ld128.S.in -D REQUANTIZATION=RNDNU -o src/qu8-gemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S &
+tools/xngen src/qu8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=FP32 -o src/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S &
+tools/xngen src/qu8-gemm/4x16c4-aarch64-neondot-ld128.S.in -D REQUANTIZATION=FP32 -o src/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S &
+
### C8 / C16 micro-kernels
tools/xngen src/qs8-gemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -o src/qs8-gemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S &
tools/xngen src/qs8-gemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -o src/qs8-gemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S &
diff --git a/scripts/generate-qs8-igemm.sh b/scripts/generate-qs8-igemm.sh
index 83eb7a3..a6ace3c 100755
--- a/scripts/generate-qs8-igemm.sh
+++ b/scripts/generate-qs8-igemm.sh
@@ -205,6 +205,8 @@
tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane.c &
tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D DATATYPE=QS8 -D ARMV8=0 -o src/qs8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane.c &
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-igemm/gen/1x8-minmax-fp32-neon-mlal-lane.c &
+tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=8 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-igemm/gen/4x8-minmax-fp32-neon-mlal-lane.c &
tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=1 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane.c &
tools/xngen src/qs8-igemm/neon-mlal-lane.c.in -D MR=4 -D NR=16 -D PREFETCH=0 -D REQUANTIZATION=FP32 -D DATATYPE=QU8 -D ARMV8=0 -o src/qu8-igemm/gen/4x16-minmax-fp32-neon-mlal-lane.c &
@@ -599,6 +601,10 @@
tools/xngen src/qu8-igemm/c4-neondot.c.in -D MR=2 -D NR=32 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -o src/qu8-igemm/gen/2x32c4-minmax-rndnu-neondot.c &
tools/xngen src/qu8-igemm/c4-neondot.c.in -D MR=3 -D NR=32 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -o src/qu8-igemm/gen/3x32c4-minmax-rndnu-neondot.c &
+tools/xngen src/qu8-igemm/c4-neondot.c.in -D MR=1 -D NR=16 -D REQUANTIZATION=FP32 -D CHANNELWISE=0 -o src/qu8-igemm/gen/1x16c4-minmax-fp32-neondot.c &
+tools/xngen src/qu8-igemm/c4-neondot.c.in -D MR=2 -D NR=16 -D REQUANTIZATION=FP32 -D CHANNELWISE=0 -o src/qu8-igemm/gen/2x16c4-minmax-fp32-neondot.c &
+tools/xngen src/qu8-igemm/c4-neondot.c.in -D MR=4 -D NR=16 -D REQUANTIZATION=FP32 -D CHANNELWISE=0 -o src/qu8-igemm/gen/4x16c4-minmax-fp32-neondot.c &
+
############################### AArch64 assembly ##############################
### Cortex-A53 lane micro-kernels
tools/xngen src/qs8-igemm/4x16-aarch64-neon-mlal-lane-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -D DATATYPE=QS8 -o src/qs8-igemm/gen/4x16-minmax-rndnu-aarch64-neon-mlal-lane-cortex-a53.S &
@@ -649,6 +655,9 @@
tools/xngen src/qu8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=RNDNU -o src/qu8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-cortex-a55.S &
tools/xngen src/qu8-igemm/4x16c4-aarch64-neondot-ld128.S.in -D REQUANTIZATION=RNDNU -o src/qu8-igemm/gen/4x16c4-minmax-rndnu-aarch64-neondot-ld128.S &
+tools/xngen src/qu8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in -D REQUANTIZATION=FP32 -o src/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S &
+tools/xngen src/qu8-igemm/4x16c4-aarch64-neondot-ld128.S.in -D REQUANTIZATION=FP32 -o src/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S &
+
### C8 / C16 micro-kernels
tools/xngen src/qs8-igemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in -D PREFETCH=0 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -o src/qs8-igemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-cortex-a53.S &
tools/xngen src/qs8-igemm/1x8c8-aarch64-neon-mlal-cortex-a53.S.in -D PREFETCH=1 -D REQUANTIZATION=RNDNU -D CHANNELWISE=0 -o src/qs8-igemm/gen/1x8c8-minmax-rndnu-aarch64-neon-mlal-prfm-cortex-a53.S &
diff --git a/src/qu8-gemm/c4-neondot.c.in b/src/qu8-gemm/c4-neondot.c.in
index e84c012..31535de 100644
--- a/src/qu8-gemm/c4-neondot.c.in
+++ b/src/qu8-gemm/c4-neondot.c.in
@@ -6,7 +6,7 @@
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
$assert NR % 8 == 0
$assert 8 <= NR <= 32
-$assert REQUANTIZATION == "RNDNU"
+$assert REQUANTIZATION in ["FP32", "RNDNU"]
#include <assert.h>
#include <arm_neon.h>
diff --git a/src/qu8-gemm/gen/1x16c4-minmax-fp32-neondot.c b/src/qu8-gemm/gen/1x16c4-minmax-fp32-neondot.c
new file mode 100644
index 0000000..2f6a26e
--- /dev/null
+++ b/src/qu8-gemm/gen/1x16c4-minmax-fp32-neondot.c
@@ -0,0 +1,179 @@
+// Auto-generated file. Do not edit!
+// Template: src/qu8-gemm/c4-neondot.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const uint8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 4 * sizeof(uint8_t));
+ const uint8_t* a0 = a;
+ uint8_t* c0 = c;
+
+ const uint8x8_t va_zero_point = vld1_dup_u8(&params->fp32_neonv8.kernel_zero_point[0]);
+
+ // Loop over groups of 16 columns.
+ do {
+ // Initialize accumulators with bias. 16 bias values are loaded from the
+ // weight matrix, at the start of the group of 16 columns.
+ uint32x4_t vpacc0x0123 = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0x4567 = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0x89AB = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0xCDEF = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
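+ // Zero-point correction accumulator: dot(kernel_zero_point, A), subtracted from the vpacc accumulators after the K loop.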
+ uint32x2_t vnacc0 = vmov_n_u32(0);
+
+ // Inner accumulation loop over K for this group of 16 columns.
+ size_t k = kc;
+ // Loop unrolled 2x to load 8 bytes of A at a time.
+ while (k >= 8 * sizeof(uint8_t)) {
+ // Load a 1x8 block of activations.
+ const uint8x8_t va0x01234567 = vld1_u8(a0); a0 += 8;
+
+ // Load a 8x16 block of weights.
+ const uint8x16_t vb0123x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+
+ // Multiply-accumulate: 1x8 * 8x16 --> 1x16.
+ vnacc0 = vdot_u32(vnacc0, va_zero_point, va0x01234567);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb0123x0123, va0x01234567, 0);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb0123x4567, va0x01234567, 0);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb0123x89AB, va0x01234567, 0);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb4567x0123, va0x01234567, 1);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb4567x4567, va0x01234567, 1);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb4567x89AB, va0x01234567, 1);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
+
+ k -= 8 * sizeof(uint8_t);
+ }
+ // Handle up to 4 final positions of `k`
+ if XNN_UNLIKELY(k != 0) {
+ // Load a 1x4 block of activations.
+ const uint8x8_t va0x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a0, vmov_n_u32(0), 0)); a0 += 4;
+
+ // Load a 4x16 block of weights.
+ const uint8x16_t vb0123x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+
+ // Multiply-accumulate: 1x4 * 4x16 --> 1x16.
+ vnacc0 = vdot_u32(vnacc0, va_zero_point, va0x01234567);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb0123x0123, va0x01234567, 0);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb0123x4567, va0x01234567, 0);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb0123x89AB, va0x01234567, 0);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
+ }
+
+ // Subtract zero point from accumulators.
+ vnacc0 = vpadd_u32(vnacc0, vnacc0);
+ const uint32x4_t vnacc0x0123 = vcombine_u32(vnacc0, vnacc0);
+ int32x4_t vacc0x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc0x0123, vnacc0x0123));
+ int32x4_t vacc0x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc0x4567, vnacc0x0123));
+ int32x4_t vacc0x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc0x89AB, vnacc0x0123));
+ int32x4_t vacc0xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc0xCDEF, vnacc0x0123));
+
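+ // FP32 requantization: convert the int32 accumulators to float, scale, then round back to int32 with round-to-nearest-even (vcvtnq_s32_f32).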
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+ float32x4_t vfpacc0x89AB = vcvtq_f32_s32(vacc0x89AB);
+ float32x4_t vfpacc0xCDEF = vcvtq_f32_s32(vacc0xCDEF);
+
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+ vfpacc0x89AB = vmulq_f32(vfpacc0x89AB, vscale);
+ vfpacc0xCDEF = vmulq_f32(vfpacc0xCDEF, vscale);
+
+ vacc0x0123 = vcvtnq_s32_f32(vfpacc0x0123);
+ vacc0x4567 = vcvtnq_s32_f32(vfpacc0x4567);
+ vacc0x89AB = vcvtnq_s32_f32(vfpacc0x89AB);
+ vacc0xCDEF = vcvtnq_s32_f32(vfpacc0xCDEF);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+
+ uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+ uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF));
+#endif
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max);
+
+ vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min);
+
+ vout0x0123456789ABCDEF = vminq_u8(vout0x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_u8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
+
+ nc -= 16;
+ } else {
+ uint8x8_t vout0x01234567 = vget_low_u8(vout0x0123456789ABCDEF);
+ if (nc & 8) {
+ vst1_u8(c0, vout0x01234567); c0 += 8;
+ vout0x01234567 = vget_high_u8(vout0x0123456789ABCDEF);
+ }
+ if (nc & 4) {
+ vst1_lane_u32((void*) c0, vreinterpret_u32_u8(vout0x01234567), 0); c0 += 4;
+ vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1_lane_u16((void*) c0, vreinterpret_u16_u8(vout0x01234567), 0); c0 += 2;
+ vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1_lane_u8(c0, vout0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qu8-gemm/gen/1x8-minmax-fp32-neon-mlal-lane.c b/src/qu8-gemm/gen/1x8-minmax-fp32-neon-mlal-lane.c
new file mode 100644
index 0000000..5f71724
--- /dev/null
+++ b/src/qu8-gemm/gen/1x8-minmax-fp32-neon-mlal-lane.c
@@ -0,0 +1,216 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/neon-mlal-lane.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const uint8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const uint8_t* a0 = a;
+ uint8_t* c0 = c;
+
+ const uint8x8_t vb_zero_point = vld1_dup_u8(&params->fp32_neon.kernel_zero_point[0]);
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
+
+ size_t k = kc;
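+ // Main K loop: consume 8 bytes of A per iteration using lane-indexed widening multiply-accumulate (VMLAL).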
+ while (k >= 8 * sizeof(uint8_t)) {
+ const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
+ const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
+
+ const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+
+
+ const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+
+ k -= 8 * sizeof(uint8_t);
+ }
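+ // Remainder: handle 1-7 trailing bytes of K, one column at a time.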
+ if XNN_UNLIKELY(k != 0) {
+ const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k);
+ const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
+
+ const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+
+ if (k >= 2 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+
+ if (k > 2 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+
+ if (k >= 4 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+
+ if (k > 4 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+
+ if (k >= 6 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+
+ if (k > 6 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
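+ // FP32 requantization with the magic-bias trick: after scaling, adding the magic bias constant leaves the rounded result in the low mantissa bits; the saturating integer subtraction below strips the bias and folds in the output zero point.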
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+
+ const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
+ vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias));
+ vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias));
+
+ const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
+ vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point);
+ vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point);
+
+#if XNN_ARCH_ARM64
+ int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
+
+
+ uint8x8_t vout0x01234567 = vqmovun_s16(vacc0x01234567);
+#else
+ int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
+
+
+ uint8x8_t vout0x01234567 = vqmovun_s16(vacc0x01234567);
+#endif
+
+ const uint8x8_t voutput_min = vld1_dup_u8(&params->fp32_neon.output_min);
+ vout0x01234567 = vmax_u8(vout0x01234567, voutput_min);
+
+ const uint8x8_t voutput_max = vld1_dup_u8(&params->fp32_neon.output_max);
+ vout0x01234567 = vmin_u8(vout0x01234567, voutput_max);
+
+ if (nc >= 8) {
+ vst1_u8(c0 + 0, vout0x01234567);
+
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ vst1_lane_u32((void*) c0, vreinterpret_u32_u8(vout0x01234567), 0); c0 += 4;
+ vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1_lane_u16((void*) c0, vreinterpret_u16_u8(vout0x01234567), 0); c0 += 2;
+ vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1_lane_u8(c0, vout0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qu8-gemm/gen/2x16c4-minmax-fp32-neondot.c b/src/qu8-gemm/gen/2x16c4-minmax-fp32-neondot.c
new file mode 100644
index 0000000..dd4e3a2
--- /dev/null
+++ b/src/qu8-gemm/gen/2x16c4-minmax-fp32-neondot.c
@@ -0,0 +1,238 @@
+// Auto-generated file. Do not edit!
+// Template: src/qu8-gemm/c4-neondot.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const uint8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 2);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 4 * sizeof(uint8_t));
+ const uint8_t* a0 = a;
+ uint8_t* c0 = c;
+ const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
+ uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+
+ const uint8x8_t va_zero_point = vld1_dup_u8(&params->fp32_neonv8.kernel_zero_point[0]);
+
+ // Loop over groups of 16 columns.
+ do {
+ // Initialize accumulators with bias. 16 bias values are loaded from the
+ // weight matrix, at the start of the group of 16 columns.
+ uint32x4_t vpacc0x0123 = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0x4567 = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0x89AB = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0xCDEF = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc1x0123 = vpacc0x0123;
+ uint32x4_t vpacc1x4567 = vpacc0x4567;
+ uint32x4_t vpacc1x89AB = vpacc0x89AB;
+ uint32x4_t vpacc1xCDEF = vpacc0xCDEF;
+ uint32x2_t vnacc0 = vmov_n_u32(0);
+ uint32x2_t vnacc1 = vmov_n_u32(0);
+
+ // Inner accumulation loop over K for this group of 16 columns.
+ size_t k = kc;
+ // Loop unrolled 2x to load 8 bytes of A at a time.
+ while (k >= 8 * sizeof(uint8_t)) {
+ // Load a 2x8 block of activations.
+ const uint8x8_t va0x01234567 = vld1_u8(a0); a0 += 8;
+ const uint8x8_t va1x01234567 = vld1_u8(a1); a1 += 8;
+
+ // Load a 8x16 block of weights.
+ const uint8x16_t vb0123x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+
+ // Multiply-accumulate: 2x8 * 8x16 --> 2x16.
+ vnacc0 = vdot_u32(vnacc0, va_zero_point, va0x01234567);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb0123x0123, va0x01234567, 0);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb0123x4567, va0x01234567, 0);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb0123x89AB, va0x01234567, 0);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb4567x0123, va0x01234567, 1);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb4567x4567, va0x01234567, 1);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb4567x89AB, va0x01234567, 1);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
+ vnacc1 = vdot_u32(vnacc1, va_zero_point, va1x01234567);
+ vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb0123x0123, va1x01234567, 0);
+ vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb0123x4567, va1x01234567, 0);
+ vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb0123x89AB, va1x01234567, 0);
+ vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
+ vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb4567x0123, va1x01234567, 1);
+ vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb4567x4567, va1x01234567, 1);
+ vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb4567x89AB, va1x01234567, 1);
+ vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
+
+ k -= 8 * sizeof(uint8_t);
+ }
+ // Handle up to 4 final positions of `k`
+ if XNN_UNLIKELY(k != 0) {
+ // Load a 2x4 block of activations.
+ const uint8x8_t va0x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a0, vmov_n_u32(0), 0)); a0 += 4;
+ const uint8x8_t va1x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a1, vmov_n_u32(0), 0)); a1 += 4;
+
+ // Load a 4x16 block of weights.
+ const uint8x16_t vb0123x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+
+ // Multiply-accumulate: 2x4 * 4x16 --> 2x16.
+ vnacc0 = vdot_u32(vnacc0, va_zero_point, va0x01234567);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb0123x0123, va0x01234567, 0);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb0123x4567, va0x01234567, 0);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb0123x89AB, va0x01234567, 0);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
+ vnacc1 = vdot_u32(vnacc1, va_zero_point, va1x01234567);
+ vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb0123x0123, va1x01234567, 0);
+ vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb0123x4567, va1x01234567, 0);
+ vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb0123x89AB, va1x01234567, 0);
+ vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
+ }
+
+ // Subtract zero point from accumulators.
+ vnacc0 = vpadd_u32(vnacc0, vnacc0);
+ const uint32x4_t vnacc0x0123 = vcombine_u32(vnacc0, vnacc0);
+ int32x4_t vacc0x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc0x0123, vnacc0x0123));
+ int32x4_t vacc0x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc0x4567, vnacc0x0123));
+ int32x4_t vacc0x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc0x89AB, vnacc0x0123));
+ int32x4_t vacc0xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc0xCDEF, vnacc0x0123));
+ vnacc1 = vpadd_u32(vnacc1, vnacc1);
+ const uint32x4_t vnacc1x0123 = vcombine_u32(vnacc1, vnacc1);
+ int32x4_t vacc1x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc1x0123, vnacc1x0123));
+ int32x4_t vacc1x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc1x4567, vnacc1x0123));
+ int32x4_t vacc1x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc1x89AB, vnacc1x0123));
+ int32x4_t vacc1xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc1xCDEF, vnacc1x0123));
+
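+ // FP32 requantization, as in the 1x16c4 kernel, applied to both rows.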
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+ float32x4_t vfpacc0x89AB = vcvtq_f32_s32(vacc0x89AB);
+ float32x4_t vfpacc0xCDEF = vcvtq_f32_s32(vacc0xCDEF);
+ float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123);
+ float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567);
+ float32x4_t vfpacc1x89AB = vcvtq_f32_s32(vacc1x89AB);
+ float32x4_t vfpacc1xCDEF = vcvtq_f32_s32(vacc1xCDEF);
+
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+ vfpacc0x89AB = vmulq_f32(vfpacc0x89AB, vscale);
+ vfpacc0xCDEF = vmulq_f32(vfpacc0xCDEF, vscale);
+ vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale);
+ vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale);
+ vfpacc1x89AB = vmulq_f32(vfpacc1x89AB, vscale);
+ vfpacc1xCDEF = vmulq_f32(vfpacc1xCDEF, vscale);
+
+ vacc0x0123 = vcvtnq_s32_f32(vfpacc0x0123);
+ vacc0x4567 = vcvtnq_s32_f32(vfpacc0x4567);
+ vacc0x89AB = vcvtnq_s32_f32(vfpacc0x89AB);
+ vacc0xCDEF = vcvtnq_s32_f32(vfpacc0xCDEF);
+ vacc1x0123 = vcvtnq_s32_f32(vfpacc1x0123);
+ vacc1x4567 = vcvtnq_s32_f32(vfpacc1x4567);
+ vacc1x89AB = vcvtnq_s32_f32(vfpacc1x89AB);
+ vacc1xCDEF = vcvtnq_s32_f32(vfpacc1xCDEF);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+
+ uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF);
+ uint8x16_t vout1x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+ uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF));
+ uint8x16_t vout1x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc1x01234567), vqmovun_s16(vacc1x89ABCDEF));
+#endif
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max);
+
+ vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min);
+ vout1x0123456789ABCDEF = vmaxq_u8(vout1x0123456789ABCDEF, voutput_min);
+
+ vout0x0123456789ABCDEF = vminq_u8(vout0x0123456789ABCDEF, voutput_max);
+ vout1x0123456789ABCDEF = vminq_u8(vout1x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_u8(c0 + 0, vout0x0123456789ABCDEF);
+ vst1q_u8(c1 + 0, vout1x0123456789ABCDEF);
+
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+ c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+
+ a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
+ a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
+
+ nc -= 16;
+ } else {
+ uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vget_low_u8(vout0x0123456789ABCDEF), vget_low_u8(vout1x0123456789ABCDEF));
+ if (nc & 8) {
+ vst1_u8(c0, vget_low_u8(vout0x01234567_1x01234567)); c0 += 8;
+ vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567)); c1 += 8;
+ vout0x01234567_1x01234567 = vcombine_u8(vget_high_u8(vout0x0123456789ABCDEF), vget_high_u8(vout1x0123456789ABCDEF));
+ }
+ if (nc & 4) {
+ vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
+ vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S b/src/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
new file mode 100644
index 0000000..e2e4986
--- /dev/null
+++ b/src/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
@@ -0,0 +1,735 @@
+// Auto-generated file. Do not edit!
+// Template: src/qu8-gemm/4x16c4-aarch64-neondot-cortex-a55.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# const uint8_t* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+# uint8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qu8_conv_minmax_params params) [sp + 8] -> x11
+
+# params structure is 12 bytes
+# struct {
+# uint8_t kernel_zero_point[4];
+# float scale;
+# int16_t output_zero_point;
+# uint8_t output_min;
+# uint8_t output_max;
+# } fp32_neonv8;
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0 v4
+# A1 x15 v1 v5
+# A2 x13 v2 v6
+# A3 x4 v3 (v0)
+# B x5 v8 v9 v10 v11
+# C0 x6 v16 v20 v24 v28
+# C1 x8 v17 v21 v25 v29
+# C2 x9 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# zero point v7 v12 v13 v14 v15
+
+# x14 temp for Cortex-A55 loads
+
+BEGIN_FUNCTION xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+
+ # Clamp A and C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x12, x11, [sp] // cn_stride, params
+ ADD x2, x2, 3 // kc = (kc + 3) & ~3
+ ADD x15, x3, x4 // a1 = a0 + a_stride
+ ADD x8, x6, x7 // c1 = c0 + cm_stride
+
+ # Save d8-d15 to stack
+ STP d8, d9, [sp, -64]!
+ CSEL x15, x3, x15, LO // a1 = a0
+ CSEL x8, x6, x8, LO // c1 = c0
+ BIC x2, x2, 3
+ STP d10, d11, [sp, 16]
+
+ ADD x13, x15, x4 // a2 = a1 + a_stride
+ ADD x9, x8, x7 // c2 = c1 + cm_stride
+ STP d12, d13, [sp, 32]
+ // if mr <= 2
+ CSEL x13, x15, x13, LS // a2 = a1
+ CSEL x9, x8, x9, LS // c2 = c1
+ STP d14, d15, [sp, 48]
+
+ CMP x0, 4 // if mr < 4
+ ADD x4, x13, x4 // a3 = a2 + a_stride
+ ADD x7, x9, x7 // c3 = c2 + cm_stride
+
+ LD1R {v7.4s}, [x11], 4 // kernel_zero_point
+
+ CSEL x4, x13, x4, LO // a3 = a2
+ CSEL x7, x9, x7, LO // c3 = c2
+
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+
+ MOVI v12.4s, 0
+ MOVI v13.4s, 0
+ MOVI v14.4s, 0
+ MOVI v15.4s, 0
+
+ LDP q24, q28, [x5], 32
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ SUBS x0, x2, 16 // k = kc - 16
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+
+ # Is there at least 16 bytes for prologue/epilogue?
+ B.LO 4f
+
+ # prologue - read A and B values for block 0 and 1
+ LDR d0, [x3], 8
+ LDR q8, [x5], 16
+ LDR d1, [x15], 8
+ LDR d2, [x13], 8
+ LDR d3, [x4], 8
+ SUBS x0, x0, 16 // is there 16 for main loop?
+ LDR d9, [x5], 8
+ LDR x14, [x5], 8
+ # Is there at least 16 bytes for main loop?
+ B.LO 2f
+
+ # Main loop - 16 bytes of A in 4 groups.
+ # 4 rows of 4 vectors wide = 16 UDOT instructions for 4 channels.
+ # 4 LD64 for A.
+ # 4 LD128 for W, each split into 2 LD64 + INS.
+ # For each group of 4 UDOTs: 1 LD64 for A, 2 LD64 + INS for W.
+
+ .p2align 3
+1:
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v0.4b[0]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v1.4b[0]
+ INS v9.d[1], x14
+ UDOT v18.4s, v8.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ UDOT v19.4s, v8.16b, v3.4b[0]
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v0.4b[0]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v1.4b[0]
+ INS v10.d[1], x14
+ UDOT v22.4s, v9.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ UDOT v23.4s, v9.16b, v3.4b[0]
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v0.4b[0]
+ LDR d8, [x5], 8
+ UDOT v25.4s, v10.16b, v1.4b[0]
+ INS v11.d[1], x14
+ UDOT v26.4s, v10.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ UDOT v27.4s, v10.16b, v3.4b[0]
+
+ # BLOCK 3
+ UDOT v28.4s, v11.16b, v0.4b[0]
+ LDR d9, [x5], 8
+ UDOT v29.4s, v11.16b, v1.4b[0]
+ INS v8.d[1], x14
+ UDOT v30.4s, v11.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ UDOT v31.4s, v11.16b, v3.4b[0]
+
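+ # Accumulate zero-point corrections: dot(kernel_zero_point, A) for each row.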
+ UDOT v12.2s, v7.8b, v0.8b
+ UDOT v13.2s, v7.8b, v1.8b
+ UDOT v14.2s, v7.8b, v2.8b
+ UDOT v15.2s, v7.8b, v3.8b
+
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v0.4b[1]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v1.4b[1]
+ INS v9.d[1], x14
+ UDOT v18.4s, v8.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ UDOT v19.4s, v8.16b, v3.4b[1]
+ LDR d4, [x3], 8
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v0.4b[1]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v1.4b[1]
+ INS v10.d[1], x14
+ UDOT v22.4s, v9.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ UDOT v23.4s, v9.16b, v3.4b[1]
+ LDR d5, [x15], 8
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v0.4b[1]
+ LDR d8, [x5], 8
+ UDOT v25.4s, v10.16b, v1.4b[1]
+ INS v11.d[1], x14
+ UDOT v26.4s, v10.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ UDOT v27.4s, v10.16b, v3.4b[1]
+ LDR d6, [x13], 8
+
+ # BLOCK 3
+ UDOT v28.4s, v11.16b, v0.4b[1]
+ LDR d9, [x5], 8
+ UDOT v29.4s, v11.16b, v1.4b[1]
+ INS v8.d[1], x14
+ UDOT v30.4s, v11.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ UDOT v31.4s, v11.16b, v3.4b[1]
+ LDR d0, [x4], 8
+
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v4.4b[0]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v5.4b[0]
+ INS v9.d[1], x14
+ UDOT v18.4s, v8.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ UDOT v19.4s, v8.16b, v0.4b[0]
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v4.4b[0]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v5.4b[0]
+ INS v10.d[1], x14
+ UDOT v22.4s, v9.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ UDOT v23.4s, v9.16b, v0.4b[0]
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v4.4b[0]
+ LDR d8, [x5], 8
+ UDOT v25.4s, v10.16b, v5.4b[0]
+ INS v11.d[1], x14
+ UDOT v26.4s, v10.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ UDOT v27.4s, v10.16b, v0.4b[0]
+
+ # BLOCK 3
+ UDOT v28.4s, v11.16b, v4.4b[0]
+ LDR d9, [x5], 8
+ UDOT v29.4s, v11.16b, v5.4b[0]
+ INS v8.d[1], x14
+ UDOT v30.4s, v11.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ UDOT v31.4s, v11.16b, v0.4b[0]
+
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v4.4b[1]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v5.4b[1]
+ INS v9.d[1], x14
+ UDOT v18.4s, v8.16b, v6.4b[1]
+ LDR x14, [x5], 8
+ UDOT v19.4s, v8.16b, v0.4b[1]
+ LDR d1, [x15], 8
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v4.4b[1]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v5.4b[1]
+ INS v10.d[1], x14
+ UDOT v22.4s, v9.16b, v6.4b[1]
+ LDR x14, [x5], 8
+ UDOT v23.4s, v9.16b, v0.4b[1]
+ LDR d2, [x13], 8
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v4.4b[1]
+ LDR d8, [x5], 8 // First B values for block 0 and 1
+ UDOT v25.4s, v10.16b, v5.4b[1]
+ INS v11.d[1], x14
+ UDOT v26.4s, v10.16b, v6.4b[1]
+ LDR x14, [x5], 8
+ UDOT v27.4s, v10.16b, v0.4b[1]
+ LDR d3, [x4], 8
+
+ # BLOCK 3 special
+ UDOT v31.4s, v11.16b, v0.4b[1]
+ LDR d9, [x5], 8
+ UDOT v15.2s, v7.8b, v0.8b // free up v0 early
+ INS v8.d[1], x14
+ UDOT v28.4s, v11.16b, v4.4b[1]
+ LDR x14, [x5], 8
+ UDOT v29.4s, v11.16b, v5.4b[1]
+ LDR d0, [x3], 8
+ UDOT v30.4s, v11.16b, v6.4b[1]
+ SUBS x0, x0, 16
+
+ UDOT v12.2s, v7.8b, v4.8b
+ UDOT v13.2s, v7.8b, v5.8b
+ UDOT v14.2s, v7.8b, v6.8b
+ B.HS 1b
+
+ # Epilogue. Same as main loop but no preloads in final group
+2:
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v0.4b[0]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v1.4b[0]
+ INS v9.d[1], x14
+ UDOT v18.4s, v8.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ UDOT v19.4s, v8.16b, v3.4b[0]
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v0.4b[0]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v1.4b[0]
+ INS v10.d[1], x14
+ UDOT v22.4s, v9.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ UDOT v23.4s, v9.16b, v3.4b[0]
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v0.4b[0]
+ LDR d8, [x5], 8
+ UDOT v25.4s, v10.16b, v1.4b[0]
+ INS v11.d[1], x14
+ UDOT v26.4s, v10.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ UDOT v27.4s, v10.16b, v3.4b[0]
+
+ # BLOCK 3
+ UDOT v28.4s, v11.16b, v0.4b[0]
+ LDR d9, [x5], 8
+ UDOT v29.4s, v11.16b, v1.4b[0]
+ INS v8.d[1], x14
+ UDOT v30.4s, v11.16b, v2.4b[0]
+ LDR x14, [x5], 8
+ UDOT v31.4s, v11.16b, v3.4b[0]
+
+ UDOT v12.2s, v7.8b, v0.8b
+ UDOT v13.2s, v7.8b, v1.8b
+ UDOT v14.2s, v7.8b, v2.8b
+ UDOT v15.2s, v7.8b, v3.8b
+
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v0.4b[1]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v1.4b[1]
+ INS v9.d[1], x14
+ UDOT v18.4s, v8.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ UDOT v19.4s, v8.16b, v3.4b[1]
+ LDR d4, [x3], 8
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v0.4b[1]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v1.4b[1]
+ INS v10.d[1], x14
+ UDOT v22.4s, v9.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ UDOT v23.4s, v9.16b, v3.4b[1]
+ LDR d5, [x15], 8
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v0.4b[1]
+ LDR d8, [x5], 8
+ UDOT v25.4s, v10.16b, v1.4b[1]
+ INS v11.d[1], x14
+ UDOT v26.4s, v10.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ UDOT v27.4s, v10.16b, v3.4b[1]
+ LDR d6, [x13], 8
+
+ # BLOCK 3
+ UDOT v28.4s, v11.16b, v0.4b[1]
+ LDR d9, [x5], 8
+ UDOT v29.4s, v11.16b, v1.4b[1]
+ INS v8.d[1], x14
+ UDOT v30.4s, v11.16b, v2.4b[1]
+ LDR x14, [x5], 8
+ UDOT v31.4s, v11.16b, v3.4b[1]
+ LDR d0, [x4], 8
+
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v4.4b[0]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v5.4b[0]
+ INS v9.d[1], x14
+ UDOT v18.4s, v8.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ UDOT v19.4s, v8.16b, v0.4b[0]
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v4.4b[0]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v5.4b[0]
+ INS v10.d[1], x14
+ UDOT v22.4s, v9.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ UDOT v23.4s, v9.16b, v0.4b[0]
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v4.4b[0]
+ LDR d8, [x5], 8
+ UDOT v25.4s, v10.16b, v5.4b[0]
+ INS v11.d[1], x14
+ UDOT v26.4s, v10.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ UDOT v27.4s, v10.16b, v0.4b[0]
+
+ # BLOCK 3
+ UDOT v28.4s, v11.16b, v4.4b[0]
+ LDR d9, [x5], 8
+ UDOT v29.4s, v11.16b, v5.4b[0]
+ INS v8.d[1], x14
+ UDOT v30.4s, v11.16b, v6.4b[0]
+ LDR x14, [x5], 8
+ UDOT v31.4s, v11.16b, v0.4b[0]
+
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v4.4b[1]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v5.4b[1]
+ INS v9.d[1], x14
+ UDOT v18.4s, v8.16b, v6.4b[1]
+ LDR x14, [x5], 8
+ UDOT v19.4s, v8.16b, v0.4b[1]
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v4.4b[1]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v5.4b[1]
+ INS v10.d[1], x14
+ UDOT v22.4s, v9.16b, v6.4b[1]
+ LDR x14, [x5], 8
+ UDOT v23.4s, v9.16b, v0.4b[1]
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v4.4b[1]
+ UDOT v25.4s, v10.16b, v5.4b[1]
+ INS v11.d[1], x14
+ UDOT v26.4s, v10.16b, v6.4b[1]
+ UDOT v27.4s, v10.16b, v0.4b[1]
+
+ # BLOCK 3
+ UDOT v28.4s, v11.16b, v4.4b[1]
+ UDOT v29.4s, v11.16b, v5.4b[1]
+ UDOT v30.4s, v11.16b, v6.4b[1]
+ UDOT v31.4s, v11.16b, v0.4b[1]
+ AND x0, x2, 15 // kc remainder 0 to 12
+
+ UDOT v12.2s, v7.8b, v4.8b
+ UDOT v13.2s, v7.8b, v5.8b
+ UDOT v14.2s, v7.8b, v6.8b
+ UDOT v15.2s, v7.8b, v0.8b
+
+ # Is there a remainder? 4 to 12 bytes of A.
+ CBNZ x0, 4f
+
+3:
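+ # Reduce per-row zero-point corrections (v12-v15) to scalars and broadcast.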
+ ADDP v0.2s, v12.2s, v13.2s
+ ADDP v1.2s, v14.2s, v15.2s
+ DUP v12.4s, v0.s[0]
+ DUP v13.4s, v0.s[1]
+ DUP v14.4s, v1.s[0]
+ DUP v15.4s, v1.s[1]
+
+ # Subtract zero point from accumulators
+ SUB v16.4s, v16.4s, v12.4s
+ SUB v17.4s, v17.4s, v13.4s
+ SUB v18.4s, v18.4s, v14.4s
+ SUB v19.4s, v19.4s, v15.4s
+ SUB v20.4s, v20.4s, v12.4s
+ SUB v21.4s, v21.4s, v13.4s
+ SUB v22.4s, v22.4s, v14.4s
+ SUB v23.4s, v23.4s, v15.4s
+ SUB v24.4s, v24.4s, v12.4s
+ SUB v25.4s, v25.4s, v13.4s
+ SUB v26.4s, v26.4s, v14.4s
+ SUB v27.4s, v27.4s, v15.4s
+ SUB v28.4s, v28.4s, v12.4s
+ SUB v29.4s, v29.4s, v13.4s
+ SUB v30.4s, v30.4s, v14.4s
+ SUB v31.4s, v31.4s, v15.4s
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ # Apply params - scale, bias and clamp
+ LD1R {v4.4s}, [x11], 4 // scale
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v16.4s, v4.4s
+ FMUL v17.4s, v17.4s, v4.4s
+ FMUL v18.4s, v18.4s, v4.4s
+ FMUL v19.4s, v19.4s, v4.4s
+ FMUL v20.4s, v20.4s, v4.4s
+ FMUL v21.4s, v21.4s, v4.4s
+ FMUL v22.4s, v22.4s, v4.4s
+ FMUL v23.4s, v23.4s, v4.4s
+ FMUL v24.4s, v24.4s, v4.4s
+ FMUL v25.4s, v25.4s, v4.4s
+ FMUL v26.4s, v26.4s, v4.4s
+ FMUL v27.4s, v27.4s, v4.4s
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // load output zero point
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v6.8h
+ SQADD v17.8h, v17.8h, v6.8h
+ SQADD v18.8h, v18.8h, v6.8h
+ SQADD v19.8h, v19.8h, v6.8h
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v4.16b}, [x11], 1 // clamp min value
+
+ SQXTUN v0.8b, v16.8h
+ SQXTUN v1.8b, v17.8h
+ SQXTUN v2.8b, v18.8h
+ SQXTUN v3.8b, v19.8h
+ LD1R {v5.16b}, [x11] // clamp max value
+ SQXTUN2 v0.16b, v24.8h
+ SQXTUN2 v1.16b, v25.8h
+ SQXTUN2 v2.16b, v26.8h
+ SQXTUN2 v3.16b, v27.8h
+
+ SUB x11, x11, 7 // rewind params pointer
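+ # (scale 4 + output zero point 2 + min 1 = 7 bytes consumed since the
+ # scale load, so the next nc iteration re-reads the same params)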
+
+ UMAX v0.16b, v0.16b, v4.16b
+ UMAX v1.16b, v1.16b, v4.16b
+ UMAX v2.16b, v2.16b, v4.16b
+ UMAX v3.16b, v3.16b, v4.16b
+ SUBS x1, x1, 16
+ UMIN v0.16b, v0.16b, v5.16b
+ UMIN v1.16b, v1.16b, v5.16b
+ UMIN v2.16b, v2.16b, v5.16b
+ UMIN v3.16b, v3.16b, v5.16b
+ B.LO 6f
+
+ # Store full 4 x 16
+ ST1 {v0.16b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v1.16b}, [x8], x12
+ SUB x15, x15, x2 // a1 -= kc
+ ST1 {v2.16b}, [x9], x12
+ SUB x13, x13, x2 // a2 -= kc
+ ST1 {v3.16b}, [x7], x12
+ SUB x4, x4, x2 // a3 -= kc
+ B.NE 0b
+
+ # Restore d8-d15 from stack
+ LDP d14, d15, [sp, 48]
+ LDP d12, d13, [sp, 32]
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 64
+ RET
+
+ # Remainder: 4 to 12 bytes of A
+ .p2align 3
+4:
+ TBZ x0, 3, 5f
+
+ LDR d0, [x3], 8
+ LDP q8, q9, [x5], 32
+ LDR d1, [x15], 8
+ LDR d2, [x13], 8
+ LDR d3, [x4], 8
+ LDP q10, q11, [x5], 32
+ UDOT v12.2s, v7.8b, v0.8b
+ UDOT v13.2s, v7.8b, v1.8b
+ UDOT v14.2s, v7.8b, v2.8b
+ UDOT v15.2s, v7.8b, v3.8b
+ UDOT v16.4s, v8.16b, v0.4b[0]
+ UDOT v17.4s, v8.16b, v1.4b[0]
+ UDOT v18.4s, v8.16b, v2.4b[0]
+ UDOT v19.4s, v8.16b, v3.4b[0]
+ UDOT v20.4s, v9.16b, v0.4b[0]
+ UDOT v21.4s, v9.16b, v1.4b[0]
+ UDOT v22.4s, v9.16b, v2.4b[0]
+ UDOT v23.4s, v9.16b, v3.4b[0]
+ UDOT v24.4s, v10.16b, v0.4b[0]
+ UDOT v25.4s, v10.16b, v1.4b[0]
+ UDOT v26.4s, v10.16b, v2.4b[0]
+ UDOT v27.4s, v10.16b, v3.4b[0]
+ UDOT v28.4s, v11.16b, v0.4b[0]
+ UDOT v29.4s, v11.16b, v1.4b[0]
+ UDOT v30.4s, v11.16b, v2.4b[0]
+ UDOT v31.4s, v11.16b, v3.4b[0]
+ LDP q8, q9, [x5], 32
+ LDP q10, q11, [x5], 32
+ UDOT v16.4s, v8.16b, v0.4b[1]
+ UDOT v17.4s, v8.16b, v1.4b[1]
+ UDOT v18.4s, v8.16b, v2.4b[1]
+ UDOT v19.4s, v8.16b, v3.4b[1]
+ UDOT v20.4s, v9.16b, v0.4b[1]
+ UDOT v21.4s, v9.16b, v1.4b[1]
+ UDOT v22.4s, v9.16b, v2.4b[1]
+ UDOT v23.4s, v9.16b, v3.4b[1]
+ UDOT v24.4s, v10.16b, v0.4b[1]
+ UDOT v25.4s, v10.16b, v1.4b[1]
+ UDOT v26.4s, v10.16b, v2.4b[1]
+ UDOT v27.4s, v10.16b, v3.4b[1]
+ UDOT v28.4s, v11.16b, v0.4b[1]
+ UDOT v29.4s, v11.16b, v1.4b[1]
+ UDOT v30.4s, v11.16b, v2.4b[1]
+ UDOT v31.4s, v11.16b, v3.4b[1]
+ TBZ x0, 2, 3b
+5:
+ LDR s0, [x3], 4
+ LDP q8, q9, [x5], 32
+ LDR s1, [x15], 4
+ LDR s2, [x13], 4
+ LDR s3, [x4], 4
+ LDP q10, q11, [x5], 32
+ UDOT v12.2s, v7.8b, v0.8b
+ UDOT v13.2s, v7.8b, v1.8b
+ UDOT v14.2s, v7.8b, v2.8b
+ UDOT v15.2s, v7.8b, v3.8b
+ UDOT v16.4s, v8.16b, v0.4b[0]
+ UDOT v17.4s, v8.16b, v1.4b[0]
+ UDOT v18.4s, v8.16b, v2.4b[0]
+ UDOT v19.4s, v8.16b, v3.4b[0]
+ UDOT v20.4s, v9.16b, v0.4b[0]
+ UDOT v21.4s, v9.16b, v1.4b[0]
+ UDOT v22.4s, v9.16b, v2.4b[0]
+ UDOT v23.4s, v9.16b, v3.4b[0]
+ UDOT v24.4s, v10.16b, v0.4b[0]
+ UDOT v25.4s, v10.16b, v1.4b[0]
+ UDOT v26.4s, v10.16b, v2.4b[0]
+ UDOT v27.4s, v10.16b, v3.4b[0]
+ UDOT v28.4s, v11.16b, v0.4b[0]
+ UDOT v29.4s, v11.16b, v1.4b[0]
+ UDOT v30.4s, v11.16b, v2.4b[0]
+ UDOT v31.4s, v11.16b, v3.4b[0]
+ B 3b
+
+ # Store odd width
+ .p2align 3
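+ # x1 holds nc-16 here, but its low 4 bits still equal nc & 15, so bits
+ # 3..0 select stores of 8, 4, 2 and 1 bytes per row; after each partial
+ # store the remaining lanes are moved down within v0-v3.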
+6:
+ TBZ x1, 3, 7f
+ STR d0, [x6], 8
+ STR d1, [x8], 8
+ DUP d0, v0.d[1]
+ DUP d1, v1.d[1]
+ STR d2, [x9], 8
+ STR d3, [x7], 8
+ DUP d2, v2.d[1]
+ DUP d3, v3.d[1]
+7:
+ TBZ x1, 2, 8f
+ STR s0, [x6], 4
+ STR s1, [x8], 4
+ DUP s0, v0.s[1]
+ DUP s1, v1.s[1]
+ STR s2, [x9], 4
+ STR s3, [x7], 4
+ DUP s2, v2.s[1]
+ DUP s3, v3.s[1]
+8:
+ TBZ x1, 1, 9f
+ STR h0, [x6], 2
+ STR h1, [x8], 2
+ DUP h0, v0.h[1]
+ DUP h1, v1.h[1]
+ STR h2, [x9], 2
+ STR h3, [x7], 2
+ DUP h2, v2.h[1]
+ DUP h3, v3.h[1]
+9:
+ TBZ x1, 0, 10f
+ STR b0, [x6]
+ STR b1, [x8]
+ STR b2, [x9]
+ STR b3, [x7]
+10:
+ # Restore d8-d15 from stack
+ LDP d14, d15, [sp, 48]
+ LDP d12, d13, [sp, 32]
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 64
+ RET
+
+END_FUNCTION xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S b/src/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
new file mode 100644
index 0000000..26ef020
--- /dev/null
+++ b/src/qu8-gemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
@@ -0,0 +1,493 @@
+// Auto-generated file. Do not edit!
+// Template: src/qu8-gemm/4x16c4-aarch64-neondot-ld128.S.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+ # const uint8_t* restrict a, x3
+# size_t a_stride, x4
+# const void* restrict w, x5
+ # uint8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> x12
+# const union xnn_qu8_conv_minmax_params params) [sp + 8] -> x11
+
+# params structure is 12 bytes
+# struct {
+# uint8_t kernel_zero_point[4];
+# float scale;
+# int16_t output_zero_point;
+ # uint8_t output_min;
+ # uint8_t output_max;
+# } fp32_neonv8;
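+
+ # Byte offsets within params: kernel_zero_point 0, scale 4,
+ # output_zero_point 8, output_min 10, output_max 11.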
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x3 v0
+# A1 x15 v1
+# A2 x13 v2
+# A3 x4 v3
+# B x5 v4 v5 v6 v7
+# C0 x6 v16 v20 v24 v28
+# C1 x8 v17 v21 v25 v29
+# C2 x9 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# zero_point v8 v12 v13 v14 v15
+# unused v9 v10 v11
+
+BEGIN_FUNCTION xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
+
+ # Clamp A and C pointers
+ CMP x0, 2 // if mr < 2
+ LDP x12, x11, [sp] // cn_stride, params
+ ADD x2, x2, 3 // kc = (kc + 3) & ~3
+ ADD x15, x3, x4 // a1 = a0 + a_stride
+ ADD x8, x6, x7 // c1 = c0 + cm_stride
+
+ # Save d8,d12-d15 on stack
+ STR d8, [sp, -48]!
+ CSEL x15, x3, x15, LO // a1 = a0
+ CSEL x8, x6, x8, LO // c1 = c0
+ BIC x2, x2, 3
+
+ STP d12, d13, [sp, 16]
+ ADD x13, x15, x4 // a2 = a1 + a_stride
+ ADD x9, x8, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ CSEL x13, x15, x13, LS // a2 = a1
+ CSEL x9, x8, x9, LS // c2 = c1
+
+ STP d14, d15, [sp, 32]
+ CMP x0, 4 // if mr < 4
+ ADD x4, x13, x4 // a3 = a2 + a_stride
+ ADD x7, x9, x7 // c3 = c2 + cm_stride
+
+ LD1R {v8.4s}, [x11], 4 // kernel_zero_point
+
+ CSEL x4, x13, x4, LO // a3 = a2
+ CSEL x7, x9, x7, LO // c3 = c2
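+
+ # Rows past mr alias the previous row's A and C pointers, so out-of-range
+ # rows redundantly recompute and re-store the same data instead of
+ # branching.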
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+
+ MOVI v12.4s, 0
+ MOVI v13.4s, 0
+ MOVI v14.4s, 0
+ MOVI v15.4s, 0
+
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ SUBS x0, x2, 16 // k = kc - 16
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+
+ # Is there at least 16 bytes?
+ B.LO 3f
+
+ # Main loop - 16 bytes of A
+ .p2align 3
+1:
+ LDR q0, [x3], 16
+ LDR q4, [x5], 16
+ LDR q1, [x15], 16
+ LDR q2, [x13], 16
+ LDR q3, [x4], 16
+ LDR q5, [x5], 16
+
+ UDOT v12.4s, v8.16b, v0.16b // accumulate kernel_zero_point * A
+ UDOT v13.4s, v8.16b, v1.16b
+ UDOT v14.4s, v8.16b, v2.16b
+ UDOT v15.4s, v8.16b, v3.16b
+
+ UDOT v16.4s, v4.16b, v0.4b[0]
+ UDOT v17.4s, v4.16b, v1.4b[0]
+ LDP q6, q7, [x5], 32
+ UDOT v18.4s, v4.16b, v2.4b[0]
+ UDOT v19.4s, v4.16b, v3.4b[0]
+ UDOT v20.4s, v5.16b, v0.4b[0]
+ UDOT v21.4s, v5.16b, v1.4b[0]
+ UDOT v22.4s, v5.16b, v2.4b[0]
+ UDOT v23.4s, v5.16b, v3.4b[0]
+ UDOT v24.4s, v6.16b, v0.4b[0]
+ UDOT v25.4s, v6.16b, v1.4b[0]
+ LDP q4, q5, [x5], 32
+ UDOT v26.4s, v6.16b, v2.4b[0]
+ UDOT v27.4s, v6.16b, v3.4b[0]
+ UDOT v28.4s, v7.16b, v0.4b[0]
+ UDOT v29.4s, v7.16b, v1.4b[0]
+ UDOT v30.4s, v7.16b, v2.4b[0]
+ UDOT v31.4s, v7.16b, v3.4b[0]
+
+ UDOT v16.4s, v4.16b, v0.4b[1]
+ UDOT v17.4s, v4.16b, v1.4b[1]
+ LDP q6, q7, [x5], 32
+ UDOT v18.4s, v4.16b, v2.4b[1]
+ UDOT v19.4s, v4.16b, v3.4b[1]
+ UDOT v20.4s, v5.16b, v0.4b[1]
+ UDOT v21.4s, v5.16b, v1.4b[1]
+ UDOT v22.4s, v5.16b, v2.4b[1]
+ UDOT v23.4s, v5.16b, v3.4b[1]
+ UDOT v24.4s, v6.16b, v0.4b[1]
+ UDOT v25.4s, v6.16b, v1.4b[1]
+ LDP q4, q5, [x5], 32
+ UDOT v26.4s, v6.16b, v2.4b[1]
+ UDOT v27.4s, v6.16b, v3.4b[1]
+ UDOT v28.4s, v7.16b, v0.4b[1]
+ UDOT v29.4s, v7.16b, v1.4b[1]
+ UDOT v30.4s, v7.16b, v2.4b[1]
+ UDOT v31.4s, v7.16b, v3.4b[1]
+
+ UDOT v16.4s, v4.16b, v0.4b[2]
+ UDOT v17.4s, v4.16b, v1.4b[2]
+ LDP q6, q7, [x5], 32
+ UDOT v18.4s, v4.16b, v2.4b[2]
+ UDOT v19.4s, v4.16b, v3.4b[2]
+ UDOT v20.4s, v5.16b, v0.4b[2]
+ UDOT v21.4s, v5.16b, v1.4b[2]
+ UDOT v22.4s, v5.16b, v2.4b[2]
+ UDOT v23.4s, v5.16b, v3.4b[2]
+ UDOT v24.4s, v6.16b, v0.4b[2]
+ UDOT v25.4s, v6.16b, v1.4b[2]
+ LDP q4, q5, [x5], 32
+ UDOT v26.4s, v6.16b, v2.4b[2]
+ UDOT v27.4s, v6.16b, v3.4b[2]
+ UDOT v28.4s, v7.16b, v0.4b[2]
+ UDOT v29.4s, v7.16b, v1.4b[2]
+ UDOT v30.4s, v7.16b, v2.4b[2]
+ UDOT v31.4s, v7.16b, v3.4b[2]
+
+ UDOT v16.4s, v4.16b, v0.4b[3]
+ UDOT v17.4s, v4.16b, v1.4b[3]
+ LDP q6, q7, [x5], 32
+ UDOT v18.4s, v4.16b, v2.4b[3]
+ UDOT v19.4s, v4.16b, v3.4b[3]
+ UDOT v20.4s, v5.16b, v0.4b[3]
+ UDOT v21.4s, v5.16b, v1.4b[3]
+ UDOT v22.4s, v5.16b, v2.4b[3]
+ UDOT v23.4s, v5.16b, v3.4b[3]
+ UDOT v24.4s, v6.16b, v0.4b[3]
+ UDOT v25.4s, v6.16b, v1.4b[3]
+ UDOT v26.4s, v6.16b, v2.4b[3]
+ UDOT v27.4s, v6.16b, v3.4b[3]
+ SUBS x0, x0, 16
+ UDOT v28.4s, v7.16b, v0.4b[3]
+ UDOT v29.4s, v7.16b, v1.4b[3]
+ UDOT v30.4s, v7.16b, v2.4b[3]
+ UDOT v31.4s, v7.16b, v3.4b[3]
+ B.HS 1b
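+
+ # Each iteration above consumes 16 bytes of A per row: four rounds of
+ # UDOTs, one per 4-byte lane of the activation q-registers.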
+
+ # Is there a remainder? 4 to 12 bytes of A
+ TST x0, 15
+ B.NE 3f
+
+2:
+ ADDP v0.4s, v12.4s, v12.4s
+ ADDP v1.4s, v13.4s, v13.4s
+ ADDP v2.4s, v14.4s, v14.4s
+ ADDP v3.4s, v15.4s, v15.4s
+ ADDP v12.4s, v0.4s, v0.4s
+ ADDP v13.4s, v1.4s, v1.4s
+ ADDP v14.4s, v2.4s, v2.4s
+ ADDP v15.4s, v3.4s, v3.4s
+
+ # Subtract zero point from accumulators
+ SUB v16.4s, v16.4s, v12.4s
+ SUB v17.4s, v17.4s, v13.4s
+ SUB v18.4s, v18.4s, v14.4s
+ SUB v19.4s, v19.4s, v15.4s
+ SUB v20.4s, v20.4s, v12.4s
+ SUB v21.4s, v21.4s, v13.4s
+ SUB v22.4s, v22.4s, v14.4s
+ SUB v23.4s, v23.4s, v15.4s
+ SUB v24.4s, v24.4s, v12.4s
+ SUB v25.4s, v25.4s, v13.4s
+ SUB v26.4s, v26.4s, v14.4s
+ SUB v27.4s, v27.4s, v15.4s
+ SUB v28.4s, v28.4s, v12.4s
+ SUB v29.4s, v29.4s, v13.4s
+ SUB v30.4s, v30.4s, v14.4s
+ SUB v31.4s, v31.4s, v15.4s
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ # Apply params - scale, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v16.4s, v4.4s
+ FMUL v17.4s, v17.4s, v4.4s
+ FMUL v18.4s, v18.4s, v4.4s
+ FMUL v19.4s, v19.4s, v4.4s
+ FMUL v20.4s, v20.4s, v4.4s
+ FMUL v21.4s, v21.4s, v4.4s
+ FMUL v22.4s, v22.4s, v4.4s
+ FMUL v23.4s, v23.4s, v4.4s
+ FMUL v24.4s, v24.4s, v4.4s
+ FMUL v25.4s, v25.4s, v4.4s
+ FMUL v26.4s, v26.4s, v4.4s
+ FMUL v27.4s, v27.4s, v4.4s
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // load output zero point
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v6.8h
+ SQADD v17.8h, v17.8h, v6.8h
+ SQADD v18.8h, v18.8h, v6.8h
+ SQADD v19.8h, v19.8h, v6.8h
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v4.16b}, [x11], 1 // clamp min value
+
+ SQXTUN v0.8b, v16.8h
+ SQXTUN v1.8b, v17.8h
+ SQXTUN v2.8b, v18.8h
+ SQXTUN v3.8b, v19.8h
+ LD1R {v5.16b}, [x11] // clamp max value
+ SQXTUN2 v0.16b, v24.8h
+ SQXTUN2 v1.16b, v25.8h
+ SQXTUN2 v2.16b, v26.8h
+ SQXTUN2 v3.16b, v27.8h
+
+ SUB x11, x11, 7 // rewind params pointer
+
+ UMAX v0.16b, v0.16b, v4.16b
+ UMAX v1.16b, v1.16b, v4.16b
+ UMAX v2.16b, v2.16b, v4.16b
+ UMAX v3.16b, v3.16b, v4.16b
+ SUBS x1, x1, 16
+ UMIN v0.16b, v0.16b, v5.16b
+ UMIN v1.16b, v1.16b, v5.16b
+ UMIN v2.16b, v2.16b, v5.16b
+ UMIN v3.16b, v3.16b, v5.16b
+ B.LO 5f
+
+ # Store full 4 x 16
+ ST1 {v0.16b}, [x6], x12
+ SUB x3, x3, x2 // a0 -= kc
+ ST1 {v1.16b}, [x8], x12
+ SUB x15, x15, x2 // a1 -= kc
+ ST1 {v2.16b}, [x9], x12
+ SUB x13, x13, x2 // a2 -= kc
+ ST1 {v3.16b}, [x7], x12
+ SUB x4, x4, x2 // a3 -= kc
+ B.NE 0b
+
+ # Restore d8,d12-d15 from stack
+ LDP d14, d15, [sp, 32]
+ LDP d12, d13, [sp, 16]
+ LDR d8, [sp], 48
+ RET
+
+ # Remainder: 8 bytes of A
+ .p2align 3
+3:
+ # Is there a remainder of 8 bytes of A?
+ TBZ x0, 3, 4f
+
+ LDR d0, [x3], 8
+ LDR q4, [x5], 16
+ LDR d1, [x15], 8
+ LDR d2, [x13], 8
+ LDR d3, [x4], 8
+ LDR q5, [x5], 16
+
+ UDOT v12.4s, v8.16b, v0.16b // accumulate kernel_zero_point * A
+ UDOT v13.4s, v8.16b, v1.16b
+ UDOT v14.4s, v8.16b, v2.16b
+ UDOT v15.4s, v8.16b, v3.16b
+
+ UDOT v16.4s, v4.16b, v0.4b[0]
+ UDOT v17.4s, v4.16b, v1.4b[0]
+ LDP q6, q7, [x5], 32
+ UDOT v18.4s, v4.16b, v2.4b[0]
+ UDOT v19.4s, v4.16b, v3.4b[0]
+ UDOT v20.4s, v5.16b, v0.4b[0]
+ UDOT v21.4s, v5.16b, v1.4b[0]
+ UDOT v22.4s, v5.16b, v2.4b[0]
+ UDOT v23.4s, v5.16b, v3.4b[0]
+ UDOT v24.4s, v6.16b, v0.4b[0]
+ UDOT v25.4s, v6.16b, v1.4b[0]
+ LDP q4, q5, [x5], 32
+ UDOT v26.4s, v6.16b, v2.4b[0]
+ UDOT v27.4s, v6.16b, v3.4b[0]
+ UDOT v28.4s, v7.16b, v0.4b[0]
+ UDOT v29.4s, v7.16b, v1.4b[0]
+ UDOT v30.4s, v7.16b, v2.4b[0]
+ UDOT v31.4s, v7.16b, v3.4b[0]
+ UDOT v16.4s, v4.16b, v0.4b[1]
+ UDOT v17.4s, v4.16b, v1.4b[1]
+ LDP q6, q7, [x5], 32
+ UDOT v18.4s, v4.16b, v2.4b[1]
+ UDOT v19.4s, v4.16b, v3.4b[1]
+ UDOT v20.4s, v5.16b, v0.4b[1]
+ UDOT v21.4s, v5.16b, v1.4b[1]
+ UDOT v22.4s, v5.16b, v2.4b[1]
+ UDOT v23.4s, v5.16b, v3.4b[1]
+ UDOT v24.4s, v6.16b, v0.4b[1]
+ UDOT v25.4s, v6.16b, v1.4b[1]
+ UDOT v26.4s, v6.16b, v2.4b[1]
+ UDOT v27.4s, v6.16b, v3.4b[1]
+ UDOT v28.4s, v7.16b, v0.4b[1]
+ UDOT v29.4s, v7.16b, v1.4b[1]
+ UDOT v30.4s, v7.16b, v2.4b[1]
+ UDOT v31.4s, v7.16b, v3.4b[1]
+ # Is there a remainder of 4 bytes of A?
+ TBZ x0, 2, 2b
+
+ # Remainder: 4 bytes of A
+4:
+ LDR s0, [x3], 4
+ LDR q4, [x5], 16
+ LDR s1, [x15], 4
+ LDR s2, [x13], 4
+ LDR s3, [x4], 4
+ LDR q5, [x5], 16
+
+ UDOT v12.4s, v8.16b, v0.16b // accumulate kernel_zero_point * A
+ UDOT v13.4s, v8.16b, v1.16b
+ UDOT v14.4s, v8.16b, v2.16b
+ UDOT v15.4s, v8.16b, v3.16b
+
+ UDOT v16.4s, v4.16b, v0.4b[0]
+ UDOT v17.4s, v4.16b, v1.4b[0]
+ UDOT v18.4s, v4.16b, v2.4b[0]
+ UDOT v19.4s, v4.16b, v3.4b[0]
+ LDP q6, q7, [x5], 32
+ UDOT v20.4s, v5.16b, v0.4b[0]
+ UDOT v21.4s, v5.16b, v1.4b[0]
+ UDOT v22.4s, v5.16b, v2.4b[0]
+ UDOT v23.4s, v5.16b, v3.4b[0]
+ UDOT v24.4s, v6.16b, v0.4b[0]
+ UDOT v25.4s, v6.16b, v1.4b[0]
+ UDOT v26.4s, v6.16b, v2.4b[0]
+ UDOT v27.4s, v6.16b, v3.4b[0]
+ UDOT v28.4s, v7.16b, v0.4b[0]
+ UDOT v29.4s, v7.16b, v1.4b[0]
+ UDOT v30.4s, v7.16b, v2.4b[0]
+ UDOT v31.4s, v7.16b, v3.4b[0]
+ B 2b
+
+ # Store odd width
+ .p2align 3
+5:
+ TBZ x1, 3, 6f
+ STR d0, [x6], 8
+ STR d1, [x8], 8
+ DUP d0, v0.d[1]
+ DUP d1, v1.d[1]
+ STR d2, [x9], 8
+ STR d3, [x7], 8
+ DUP d2, v2.d[1]
+ DUP d3, v3.d[1]
+6:
+ TBZ x1, 2, 7f
+ STR s0, [x6], 4
+ STR s1, [x8], 4
+ DUP s0, v0.s[1]
+ DUP s1, v1.s[1]
+ STR s2, [x9], 4
+ STR s3, [x7], 4
+ DUP s2, v2.s[1]
+ DUP s3, v3.s[1]
+7:
+ TBZ x1, 1, 8f
+ STR h0, [x6], 2
+ STR h1, [x8], 2
+ DUP h0, v0.h[1]
+ DUP h1, v1.h[1]
+ STR h2, [x9], 2
+ STR h3, [x7], 2
+ DUP h2, v2.h[1]
+ DUP h3, v3.h[1]
+8:
+ TBZ x1, 0, 9f
+ STR b0, [x6]
+ STR b1, [x8]
+ STR b2, [x9]
+ STR b3, [x7]
+9:
+ # Restore d8,d12-d15 from stack
+ LDP d14, d15, [sp, 32]
+ LDP d12, d13, [sp, 16]
+ LDR d8, [sp], 48
+ RET
+
+END_FUNCTION xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qu8-gemm/gen/4x16c4-minmax-fp32-neondot.c b/src/qu8-gemm/gen/4x16c4-minmax-fp32-neondot.c
new file mode 100644
index 0000000..8751d3b
--- /dev/null
+++ b/src/qu8-gemm/gen/4x16c4-minmax-fp32-neondot.c
@@ -0,0 +1,361 @@
+// Auto-generated file. Do not edit!
+// Template: src/qu8-gemm/c4-neondot.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const uint8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 4 * sizeof(uint8_t));
+ const uint8_t* a0 = a;
+ uint8_t* c0 = c;
+ const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
+ uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
+ uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const uint8_t* a3 = (const uint8_t*) ((uintptr_t) a2 + a_stride);
+ uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+
+ const uint8x8_t va_zero_point = vld1_dup_u8(&params->fp32_neonv8.kernel_zero_point[0]);
+
+ // Loop over groups of 16 columns.
+ do {
+ // Initialize accumulators with bias. 16 bias values are loaded from the
+ // weight matrix, at the start of the group of 16 columns.
+ uint32x4_t vpacc0x0123 = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0x4567 = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0x89AB = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0xCDEF = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc1x0123 = vpacc0x0123;
+ uint32x4_t vpacc1x4567 = vpacc0x4567;
+ uint32x4_t vpacc1x89AB = vpacc0x89AB;
+ uint32x4_t vpacc1xCDEF = vpacc0xCDEF;
+ uint32x4_t vpacc2x0123 = vpacc0x0123;
+ uint32x4_t vpacc2x4567 = vpacc0x4567;
+ uint32x4_t vpacc2x89AB = vpacc0x89AB;
+ uint32x4_t vpacc2xCDEF = vpacc0xCDEF;
+ uint32x4_t vpacc3x0123 = vpacc0x0123;
+ uint32x4_t vpacc3x4567 = vpacc0x4567;
+ uint32x4_t vpacc3x89AB = vpacc0x89AB;
+ uint32x4_t vpacc3xCDEF = vpacc0xCDEF;
+ uint32x2_t vnacc0 = vmov_n_u32(0);
+ uint32x2_t vnacc1 = vmov_n_u32(0);
+ uint32x2_t vnacc2 = vmov_n_u32(0);
+ uint32x2_t vnacc3 = vmov_n_u32(0);
+
+ // Inner accumulation loop over the K dimension for this group of 16 columns.
+ size_t k = kc;
+ // Loop unrolled 2x to load 8 bytes (two groups of 4) at a time.
+ while (k >= 8 * sizeof(uint8_t)) {
+ // Load a 4x8 block of activations.
+ const uint8x8_t va0x01234567 = vld1_u8(a0); a0 += 8;
+ const uint8x8_t va1x01234567 = vld1_u8(a1); a1 += 8;
+ const uint8x8_t va2x01234567 = vld1_u8(a2); a2 += 8;
+ const uint8x8_t va3x01234567 = vld1_u8(a3); a3 += 8;
+
+ // Load an 8x16 block of weights.
+ const uint8x16_t vb0123x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+
+ // Multiply-accumulate: 4x8 * 8x16 --> 4x16.
+ vnacc0 = vdot_u32(vnacc0, va_zero_point, va0x01234567);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb0123x0123, va0x01234567, 0);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb0123x4567, va0x01234567, 0);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb0123x89AB, va0x01234567, 0);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb4567x0123, va0x01234567, 1);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb4567x4567, va0x01234567, 1);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb4567x89AB, va0x01234567, 1);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
+ vnacc1 = vdot_u32(vnacc1, va_zero_point, va1x01234567);
+ vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb0123x0123, va1x01234567, 0);
+ vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb0123x4567, va1x01234567, 0);
+ vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb0123x89AB, va1x01234567, 0);
+ vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
+ vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb4567x0123, va1x01234567, 1);
+ vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb4567x4567, va1x01234567, 1);
+ vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb4567x89AB, va1x01234567, 1);
+ vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
+ vnacc2 = vdot_u32(vnacc2, va_zero_point, va2x01234567);
+ vpacc2x0123 = vdotq_lane_u32(vpacc2x0123, vb0123x0123, va2x01234567, 0);
+ vpacc2x4567 = vdotq_lane_u32(vpacc2x4567, vb0123x4567, va2x01234567, 0);
+ vpacc2x89AB = vdotq_lane_u32(vpacc2x89AB, vb0123x89AB, va2x01234567, 0);
+ vpacc2xCDEF = vdotq_lane_u32(vpacc2xCDEF, vb0123xCDEF, va2x01234567, 0);
+ vpacc2x0123 = vdotq_lane_u32(vpacc2x0123, vb4567x0123, va2x01234567, 1);
+ vpacc2x4567 = vdotq_lane_u32(vpacc2x4567, vb4567x4567, va2x01234567, 1);
+ vpacc2x89AB = vdotq_lane_u32(vpacc2x89AB, vb4567x89AB, va2x01234567, 1);
+ vpacc2xCDEF = vdotq_lane_u32(vpacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
+ vnacc3 = vdot_u32(vnacc3, va_zero_point, va3x01234567);
+ vpacc3x0123 = vdotq_lane_u32(vpacc3x0123, vb0123x0123, va3x01234567, 0);
+ vpacc3x4567 = vdotq_lane_u32(vpacc3x4567, vb0123x4567, va3x01234567, 0);
+ vpacc3x89AB = vdotq_lane_u32(vpacc3x89AB, vb0123x89AB, va3x01234567, 0);
+ vpacc3xCDEF = vdotq_lane_u32(vpacc3xCDEF, vb0123xCDEF, va3x01234567, 0);
+ vpacc3x0123 = vdotq_lane_u32(vpacc3x0123, vb4567x0123, va3x01234567, 1);
+ vpacc3x4567 = vdotq_lane_u32(vpacc3x4567, vb4567x4567, va3x01234567, 1);
+ vpacc3x89AB = vdotq_lane_u32(vpacc3x89AB, vb4567x89AB, va3x01234567, 1);
+ vpacc3xCDEF = vdotq_lane_u32(vpacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
+
+ k -= 8 * sizeof(uint8_t);
+ }
+ // Handle the final 4 bytes of `k`: kc was rounded up to a multiple of 4, so k is either 0 or 4 here.
+ if XNN_UNLIKELY(k != 0) {
+ // Load a 4x4 block of activations.
+ const uint8x8_t va0x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a0, vmov_n_u32(0), 0)); a0 += 4;
+ const uint8x8_t va1x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a1, vmov_n_u32(0), 0)); a1 += 4;
+ const uint8x8_t va2x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a2, vmov_n_u32(0), 0)); a2 += 4;
+ const uint8x8_t va3x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a3, vmov_n_u32(0), 0)); a3 += 4;
+
+ // Load a 4x16 block of weights.
+ const uint8x16_t vb0123x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+
+ // Multiply-accumulate: 4x4 * 4x16 --> 4x16.
+ vnacc0 = vdot_u32(vnacc0, va_zero_point, va0x01234567);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb0123x0123, va0x01234567, 0);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb0123x4567, va0x01234567, 0);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb0123x89AB, va0x01234567, 0);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
+ vnacc1 = vdot_u32(vnacc1, va_zero_point, va1x01234567);
+ vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb0123x0123, va1x01234567, 0);
+ vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb0123x4567, va1x01234567, 0);
+ vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb0123x89AB, va1x01234567, 0);
+ vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
+ vnacc2 = vdot_u32(vnacc2, va_zero_point, va2x01234567);
+ vpacc2x0123 = vdotq_lane_u32(vpacc2x0123, vb0123x0123, va2x01234567, 0);
+ vpacc2x4567 = vdotq_lane_u32(vpacc2x4567, vb0123x4567, va2x01234567, 0);
+ vpacc2x89AB = vdotq_lane_u32(vpacc2x89AB, vb0123x89AB, va2x01234567, 0);
+ vpacc2xCDEF = vdotq_lane_u32(vpacc2xCDEF, vb0123xCDEF, va2x01234567, 0);
+ vnacc3 = vdot_u32(vnacc3, va_zero_point, va3x01234567);
+ vpacc3x0123 = vdotq_lane_u32(vpacc3x0123, vb0123x0123, va3x01234567, 0);
+ vpacc3x4567 = vdotq_lane_u32(vpacc3x4567, vb0123x4567, va3x01234567, 0);
+ vpacc3x89AB = vdotq_lane_u32(vpacc3x89AB, vb0123x89AB, va3x01234567, 0);
+ vpacc3xCDEF = vdotq_lane_u32(vpacc3xCDEF, vb0123xCDEF, va3x01234567, 0);
+ }
+
+ // Subtract zero point from accumulators.
+ vnacc0 = vpadd_u32(vnacc0, vnacc0);
+ const uint32x4_t vnacc0x0123 = vcombine_u32(vnacc0, vnacc0);
+ int32x4_t vacc0x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc0x0123, vnacc0x0123));
+ int32x4_t vacc0x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc0x4567, vnacc0x0123));
+ int32x4_t vacc0x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc0x89AB, vnacc0x0123));
+ int32x4_t vacc0xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc0xCDEF, vnacc0x0123));
+ vnacc1 = vpadd_u32(vnacc1, vnacc1);
+ const uint32x4_t vnacc1x0123 = vcombine_u32(vnacc1, vnacc1);
+ int32x4_t vacc1x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc1x0123, vnacc1x0123));
+ int32x4_t vacc1x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc1x4567, vnacc1x0123));
+ int32x4_t vacc1x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc1x89AB, vnacc1x0123));
+ int32x4_t vacc1xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc1xCDEF, vnacc1x0123));
+ vnacc2 = vpadd_u32(vnacc2, vnacc2);
+ const uint32x4_t vnacc2x0123 = vcombine_u32(vnacc2, vnacc2);
+ int32x4_t vacc2x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc2x0123, vnacc2x0123));
+ int32x4_t vacc2x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc2x4567, vnacc2x0123));
+ int32x4_t vacc2x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc2x89AB, vnacc2x0123));
+ int32x4_t vacc2xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc2xCDEF, vnacc2x0123));
+ vnacc3 = vpadd_u32(vnacc3, vnacc3);
+ const uint32x4_t vnacc3x0123 = vcombine_u32(vnacc3, vnacc3);
+ int32x4_t vacc3x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc3x0123, vnacc3x0123));
+ int32x4_t vacc3x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc3x4567, vnacc3x0123));
+ int32x4_t vacc3x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc3x89AB, vnacc3x0123));
+ int32x4_t vacc3xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc3xCDEF, vnacc3x0123));
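+ // Note: vsubq_u32 wraps modulo 2^32, so reinterpreting the difference as
+ // int32 yields the correct signed accumulator as long as the true value
+ // fits in int32.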
+
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+ float32x4_t vfpacc0x89AB = vcvtq_f32_s32(vacc0x89AB);
+ float32x4_t vfpacc0xCDEF = vcvtq_f32_s32(vacc0xCDEF);
+ float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123);
+ float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567);
+ float32x4_t vfpacc1x89AB = vcvtq_f32_s32(vacc1x89AB);
+ float32x4_t vfpacc1xCDEF = vcvtq_f32_s32(vacc1xCDEF);
+ float32x4_t vfpacc2x0123 = vcvtq_f32_s32(vacc2x0123);
+ float32x4_t vfpacc2x4567 = vcvtq_f32_s32(vacc2x4567);
+ float32x4_t vfpacc2x89AB = vcvtq_f32_s32(vacc2x89AB);
+ float32x4_t vfpacc2xCDEF = vcvtq_f32_s32(vacc2xCDEF);
+ float32x4_t vfpacc3x0123 = vcvtq_f32_s32(vacc3x0123);
+ float32x4_t vfpacc3x4567 = vcvtq_f32_s32(vacc3x4567);
+ float32x4_t vfpacc3x89AB = vcvtq_f32_s32(vacc3x89AB);
+ float32x4_t vfpacc3xCDEF = vcvtq_f32_s32(vacc3xCDEF);
+
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+ vfpacc0x89AB = vmulq_f32(vfpacc0x89AB, vscale);
+ vfpacc0xCDEF = vmulq_f32(vfpacc0xCDEF, vscale);
+ vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale);
+ vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale);
+ vfpacc1x89AB = vmulq_f32(vfpacc1x89AB, vscale);
+ vfpacc1xCDEF = vmulq_f32(vfpacc1xCDEF, vscale);
+ vfpacc2x0123 = vmulq_f32(vfpacc2x0123, vscale);
+ vfpacc2x4567 = vmulq_f32(vfpacc2x4567, vscale);
+ vfpacc2x89AB = vmulq_f32(vfpacc2x89AB, vscale);
+ vfpacc2xCDEF = vmulq_f32(vfpacc2xCDEF, vscale);
+ vfpacc3x0123 = vmulq_f32(vfpacc3x0123, vscale);
+ vfpacc3x4567 = vmulq_f32(vfpacc3x4567, vscale);
+ vfpacc3x89AB = vmulq_f32(vfpacc3x89AB, vscale);
+ vfpacc3xCDEF = vmulq_f32(vfpacc3xCDEF, vscale);
+
+ vacc0x0123 = vcvtnq_s32_f32(vfpacc0x0123);
+ vacc0x4567 = vcvtnq_s32_f32(vfpacc0x4567);
+ vacc0x89AB = vcvtnq_s32_f32(vfpacc0x89AB);
+ vacc0xCDEF = vcvtnq_s32_f32(vfpacc0xCDEF);
+ vacc1x0123 = vcvtnq_s32_f32(vfpacc1x0123);
+ vacc1x4567 = vcvtnq_s32_f32(vfpacc1x4567);
+ vacc1x89AB = vcvtnq_s32_f32(vfpacc1x89AB);
+ vacc1xCDEF = vcvtnq_s32_f32(vfpacc1xCDEF);
+ vacc2x0123 = vcvtnq_s32_f32(vfpacc2x0123);
+ vacc2x4567 = vcvtnq_s32_f32(vfpacc2x4567);
+ vacc2x89AB = vcvtnq_s32_f32(vfpacc2x89AB);
+ vacc2xCDEF = vcvtnq_s32_f32(vfpacc2xCDEF);
+ vacc3x0123 = vcvtnq_s32_f32(vfpacc3x0123);
+ vacc3x4567 = vcvtnq_s32_f32(vfpacc3x4567);
+ vacc3x89AB = vcvtnq_s32_f32(vfpacc3x89AB);
+ vacc3xCDEF = vcvtnq_s32_f32(vfpacc3xCDEF);
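+
+ // vcvtnq_s32_f32 rounds to nearest, ties to even: this is the FP32
+ // requantization step that replaces fixed-point rounding.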
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+ const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+ const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+ const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
+ uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF);
+ uint8x16_t vout1x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc1x01234567), vacc1x89ABCDEF);
+ uint8x16_t vout2x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc2x89ABCDEF);
+ uint8x16_t vout3x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+ const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+ const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+ const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+ uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF));
+ uint8x16_t vout1x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc1x01234567), vqmovun_s16(vacc1x89ABCDEF));
+ uint8x16_t vout2x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc2x89ABCDEF));
+ uint8x16_t vout3x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc3x01234567), vqmovun_s16(vacc3x89ABCDEF));
+#endif
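+ // vqmovn_high_s32 / vqmovun_high_s16 are AArch64-only; the fallback path
+ // above builds the same vectors from two narrowing halves with vcombine.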
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max);
+
+ vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min);
+ vout1x0123456789ABCDEF = vmaxq_u8(vout1x0123456789ABCDEF, voutput_min);
+ vout2x0123456789ABCDEF = vmaxq_u8(vout2x0123456789ABCDEF, voutput_min);
+ vout3x0123456789ABCDEF = vmaxq_u8(vout3x0123456789ABCDEF, voutput_min);
+
+ vout0x0123456789ABCDEF = vminq_u8(vout0x0123456789ABCDEF, voutput_max);
+ vout1x0123456789ABCDEF = vminq_u8(vout1x0123456789ABCDEF, voutput_max);
+ vout2x0123456789ABCDEF = vminq_u8(vout2x0123456789ABCDEF, voutput_max);
+ vout3x0123456789ABCDEF = vminq_u8(vout3x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_u8(c0 + 0, vout0x0123456789ABCDEF);
+ vst1q_u8(c1 + 0, vout1x0123456789ABCDEF);
+ vst1q_u8(c2 + 0, vout2x0123456789ABCDEF);
+ vst1q_u8(c3 + 0, vout3x0123456789ABCDEF);
+
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+ c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+ c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
+ c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
+
+ a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
+ a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
+ a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
+ a3 = (const uint8_t*) ((uintptr_t) a3 - kc);
+
+ nc -= 16;
+ } else {
+ uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vget_low_u8(vout0x0123456789ABCDEF), vget_low_u8(vout1x0123456789ABCDEF));
+ uint8x16_t vout2x01234567_3x01234567 = vcombine_u8(vget_low_u8(vout2x0123456789ABCDEF), vget_low_u8(vout3x0123456789ABCDEF));
+ if (nc & 8) {
+ vst1_u8(c0, vget_low_u8(vout0x01234567_1x01234567)); c0 += 8;
+ vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567)); c1 += 8;
+ vst1_u8(c2, vget_low_u8(vout2x01234567_3x01234567)); c2 += 8;
+ vst1_u8(c3, vget_high_u8(vout2x01234567_3x01234567)); c3 += 8;
+ vout0x01234567_1x01234567 = vcombine_u8(vget_high_u8(vout0x0123456789ABCDEF), vget_high_u8(vout1x0123456789ABCDEF));
+ vout2x01234567_3x01234567 = vcombine_u8(vget_high_u8(vout2x0123456789ABCDEF), vget_high_u8(vout3x0123456789ABCDEF));
+ }
+ if (nc & 4) {
+ vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32((void*) c2, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 0); c2 += 4;
+ vst1q_lane_u32((void*) c3, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 2); c3 += 4;
+ vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16((void*) c2, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 0); c2 += 2;
+ vst1q_lane_u16((void*) c3, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 4); c3 += 2;
+ vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
+ vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0);
+ vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qu8-gemm/gen/4x8-minmax-fp32-neon-mlal-lane.c b/src/qu8-gemm/gen/4x8-minmax-fp32-neon-mlal-lane.c
new file mode 100644
index 0000000..fede3ee
--- /dev/null
+++ b/src/qu8-gemm/gen/4x8-minmax-fp32-neon-mlal-lane.c
@@ -0,0 +1,393 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/neon-mlal-lane.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const uint8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const uint8_t* a0 = a;
+ uint8_t* c0 = c;
+ const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
+ uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const uint8_t* a2 = (const uint8_t*) ((uintptr_t) a1 + a_stride);
+ uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+ const uint8_t* a3 = (const uint8_t*) ((uintptr_t) a2 + a_stride);
+ uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ a3 = a2;
+ c3 = c2;
+ }
+
+ const uint8x8_t vb_zero_point = vld1_dup_u8(&params->fp32_neon.kernel_zero_point[0]);
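+ // Unlike the neondot kernels, this path folds the kernel zero point into
+ // B up front (vsubl_u8 below widens B to int16 while subtracting it), so
+ // no separate correction accumulator is needed.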
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
+ int32x4_t vacc1x0123 = vacc0x0123;
+ int32x4_t vacc1x4567 = vacc0x4567;
+ int32x4_t vacc2x0123 = vacc0x0123;
+ int32x4_t vacc2x4567 = vacc0x4567;
+ int32x4_t vacc3x0123 = vacc0x0123;
+ int32x4_t vacc3x4567 = vacc0x4567;
+
+ size_t k = kc;
+ while (k >= 8 * sizeof(uint8_t)) {
+ const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
+ const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
+ const uint8x8_t va1 = vld1_u8(a1); a1 += 8;
+ const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
+ const uint8x8_t va2 = vld1_u8(a2); a2 += 8;
+ const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));
+ const uint8x8_t va3 = vld1_u8(a3); a3 += 8;
+ const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3));
+
+ const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+ const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+ const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+ const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+
+
+ const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+ const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+ const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+
+ k -= 8 * sizeof(uint8_t);
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k);
+ const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
+ const uint8x8_t va1 = vld1_u8(a1); a1 = (const uint8_t*) ((uintptr_t) a1 + k);
+ const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
+ const uint8x8_t va2 = vld1_u8(a2); a2 = (const uint8_t*) ((uintptr_t) a2 + k);
+ const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));
+ const uint8x8_t va3 = vld1_u8(a3); a3 = (const uint8_t*) ((uintptr_t) a3 + k);
+ const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3));
+
+ const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+
+ if (k >= 2 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+
+ if (k > 2 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+
+ if (k >= 4 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+
+ if (k > 4 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+
+ if (k >= 6 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+
+ if (k > 6 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+ float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123);
+ float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567);
+ float32x4_t vfpacc2x0123 = vcvtq_f32_s32(vacc2x0123);
+ float32x4_t vfpacc2x4567 = vcvtq_f32_s32(vacc2x4567);
+ float32x4_t vfpacc3x0123 = vcvtq_f32_s32(vacc3x0123);
+ float32x4_t vfpacc3x4567 = vcvtq_f32_s32(vacc3x4567);
+
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+ vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale);
+ vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale);
+ vfpacc2x0123 = vmulq_f32(vfpacc2x0123, vscale);
+ vfpacc2x4567 = vmulq_f32(vfpacc2x4567, vscale);
+ vfpacc3x0123 = vmulq_f32(vfpacc3x0123, vscale);
+ vfpacc3x4567 = vmulq_f32(vfpacc3x4567, vscale);
+
+ const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
+ vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias));
+ vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias));
+ vacc1x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x0123, vmagic_bias));
+ vacc1x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x4567, vmagic_bias));
+ vacc2x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc2x0123, vmagic_bias));
+ vacc2x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc2x4567, vmagic_bias));
+ vacc3x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc3x0123, vmagic_bias));
+ vacc3x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc3x4567, vmagic_bias));
+
+ const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
+ vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point);
+ vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point);
+ vacc1x0123 = vqsubq_s32(vacc1x0123, vmagic_bias_less_output_zero_point);
+ vacc1x4567 = vqsubq_s32(vacc1x4567, vmagic_bias_less_output_zero_point);
+ vacc2x0123 = vqsubq_s32(vacc2x0123, vmagic_bias_less_output_zero_point);
+ vacc2x4567 = vqsubq_s32(vacc2x4567, vmagic_bias_less_output_zero_point);
+ vacc3x0123 = vqsubq_s32(vacc3x0123, vmagic_bias_less_output_zero_point);
+ vacc3x4567 = vqsubq_s32(vacc3x4567, vmagic_bias_less_output_zero_point);
+
+#if XNN_ARCH_ARM64
+ int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
+ int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567);
+ int16x8_t vacc2x01234567 = vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567);
+ int16x8_t vacc3x01234567 = vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567);
+
+
+ uint8x16_t vout0x01234567_1x01234567 = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc1x01234567);
+ uint8x16_t vout2x01234567_3x01234567 = vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc3x01234567);
+#else
+ int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
+ int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
+ int16x8_t vacc2x01234567 = vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567));
+ int16x8_t vacc3x01234567 = vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567));
+
+
+ uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc1x01234567));
+ uint8x16_t vout2x01234567_3x01234567 = vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc3x01234567));
+#endif
+
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neon.output_min);
+ vout0x01234567_1x01234567 = vmaxq_u8(vout0x01234567_1x01234567, voutput_min);
+ vout2x01234567_3x01234567 = vmaxq_u8(vout2x01234567_3x01234567, voutput_min);
+
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neon.output_max);
+ vout0x01234567_1x01234567 = vminq_u8(vout0x01234567_1x01234567, voutput_max);
+ vout2x01234567_3x01234567 = vminq_u8(vout2x01234567_3x01234567, voutput_max);
+
+ if (nc >= 8) {
+ vst1_u8(c0 + 0, vget_low_u8(vout0x01234567_1x01234567));
+ vst1_u8(c1 + 0, vget_high_u8(vout0x01234567_1x01234567));
+ vst1_u8(c2 + 0, vget_low_u8(vout2x01234567_3x01234567));
+ vst1_u8(c3 + 0, vget_high_u8(vout2x01234567_3x01234567));
+
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+ c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+ c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
+ c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
+
+ a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
+ a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
+ a2 = (const uint8_t*) ((uintptr_t) a2 - kc);
+ a3 = (const uint8_t*) ((uintptr_t) a3 - kc);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32((void*) c2, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 0); c2 += 4;
+ vst1q_lane_u32((void*) c3, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 2); c3 += 4;
+ vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16((void*) c2, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 0); c2 += 2;
+ vst1q_lane_u16((void*) c3, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 4); c3 += 2;
+ vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
+ vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0);
+ vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
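For reference, the sequence above — `vcvtq_f32_s32`, `vmulq_f32` by the scale, `vaddq_f32` with the magic bias, then `vqsubq_s32` — is the fp32 magic-bias requantization. A minimal scalar sketch, assuming the usual constants `magic_bias = 12582912.0f` (bit pattern `0x4B400000`) and `magic_bias_less_output_zero_point = 0x4B400000 - output_zero_point`; the helper name is illustrative, not part of this change:

```c
#include <stdint.h>
#include <string.h>

// Scalar sketch of the fp32 magic-bias requantization (illustrative only).
// Adding 12582912.0f (2^23 + 2^22) to a float in roughly [-2^22, 2^22)
// leaves round-to-nearest(x) in the low mantissa bits, so the sum's bit
// pattern equals 0x4B400000 + round(x).
static inline uint8_t requantize_magic_bias(
    int32_t acc, float scale, int32_t output_zero_point,
    uint8_t output_min, uint8_t output_max)
{
  const float magic_bias = 12582912.0f;
  const int32_t magic_bias_less_zero_point =
      INT32_C(0x4B400000) - output_zero_point;

  float fpacc = (float) acc * scale;        // vcvtq_f32_s32 + vmulq_f32
  fpacc += magic_bias;                      // vaddq_f32

  int32_t bits;
  memcpy(&bits, &fpacc, sizeof(bits));      // vreinterpretq_s32_f32
  int32_t out = bits - magic_bias_less_zero_point;  // round(acc*scale) + zp

  if (out < (int32_t) output_min) out = output_min;  // vmaxq_u8
  if (out > (int32_t) output_max) out = output_max;  // vminq_u8
  return (uint8_t) out;
}
```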
diff --git a/src/qu8-igemm/c4-neondot.c.in b/src/qu8-igemm/c4-neondot.c.in
index 09d7673..b09bf33 100644
--- a/src/qu8-igemm/c4-neondot.c.in
+++ b/src/qu8-igemm/c4-neondot.c.in
@@ -6,7 +6,7 @@
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
$assert NR % 8 == 0
$assert 8 <= NR <= 32
-$assert REQUANTIZATION == "RNDNU"
+$assert REQUANTIZATION in ["FP32", "RNDNU"]
#include <assert.h>
#include <arm_neon.h>
diff --git a/src/qu8-igemm/gen/1x16c4-minmax-fp32-neondot.c b/src/qu8-igemm/gen/1x16c4-minmax-fp32-neondot.c
new file mode 100644
index 0000000..ff97e9e
--- /dev/null
+++ b/src/qu8-igemm/gen/1x16c4-minmax-fp32-neondot.c
@@ -0,0 +1,189 @@
+// Auto-generated file. Do not edit!
+// Template: src/qu8-igemm/c4-neondot.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const uint8_t** restrict a,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const uint8_t* zero,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 4 * sizeof(uint8_t));
+ uint8_t* c0 = c;
+
+ const uint8x8_t va_zero_point = vld1_dup_u8(&params->fp32_neonv8.kernel_zero_point[0]);
+
+ do {
+ // Initialize accumulators with bias. 16 bias values are loaded from the
+ // weight matrix, at the start of the group of 16 columns.
+ uint32x4_t vpacc0x0123 = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0x4567 = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0x89AB = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0xCDEF = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x2_t vnacc0 = vmov_n_u32(0);
+
+ size_t p = ks;
+ do {
+ const uint8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ a += 1;
+
+ // Inner accumulation loop along the 16 columns.
+ size_t k = kc;
+ // Loop partially unrolled 2x to load 8 bytes at a time.
+ while (k >= 8 * sizeof(uint8_t)) {
+ // Load a 1x8 block of activations.
+ const uint8x8_t va0x01234567 = vld1_u8(a0); a0 += 8;
+
+ // Load an 8x16 block of weights.
+ const uint8x16_t vb0123x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+
+ // Multiply-accumulate: 1x8 * 8x16 --> 1x16.
+ vnacc0 = vdot_u32(vnacc0, va_zero_point, va0x01234567);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb0123x0123, va0x01234567, 0);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb0123x4567, va0x01234567, 0);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb0123x89AB, va0x01234567, 0);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb4567x0123, va0x01234567, 1);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb4567x4567, va0x01234567, 1);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb4567x89AB, va0x01234567, 1);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
+
+ k -= 8 * sizeof(uint8_t);
+ }
+ // Handle up to 4 final positions of `k`
+ if XNN_UNLIKELY(k != 0) {
+ // Load a 1x4 block of activations.
+ const uint8x8_t va0x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a0, vmov_n_u32(0), 0)); a0 += 4;
+
+ // Load a 4x16 block of weights.
+ const uint8x16_t vb0123x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+
+ // Multiply-accumulate: 1x4 * 4x16 --> 1x16.
+ vnacc0 = vdot_u32(vnacc0, va_zero_point, va0x01234567);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb0123x0123, va0x01234567, 0);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb0123x4567, va0x01234567, 0);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb0123x89AB, va0x01234567, 0);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
+ }
+ p -= 1 * sizeof(void*);
+ } while (p != 0);
+
+ // Subtract zero point from accumulators.
+ vnacc0 = vpadd_u32(vnacc0, vnacc0);
+ const uint32x4_t vnacc0x0123 = vcombine_u32(vnacc0, vnacc0);
+ int32x4_t vacc0x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc0x0123, vnacc0x0123));
+ int32x4_t vacc0x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc0x4567, vnacc0x0123));
+ int32x4_t vacc0x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc0x89AB, vnacc0x0123));
+ int32x4_t vacc0xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc0xCDEF, vnacc0x0123));
+
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+ float32x4_t vfpacc0x89AB = vcvtq_f32_s32(vacc0x89AB);
+ float32x4_t vfpacc0xCDEF = vcvtq_f32_s32(vacc0xCDEF);
+
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+ vfpacc0x89AB = vmulq_f32(vfpacc0x89AB, vscale);
+ vfpacc0xCDEF = vmulq_f32(vfpacc0xCDEF, vscale);
+
+ vacc0x0123 = vcvtnq_s32_f32(vfpacc0x0123);
+ vacc0x4567 = vcvtnq_s32_f32(vfpacc0x4567);
+ vacc0x89AB = vcvtnq_s32_f32(vfpacc0x89AB);
+ vacc0xCDEF = vcvtnq_s32_f32(vfpacc0xCDEF);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+
+ uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+ uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF));
+#endif
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max);
+
+ vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min);
+
+ vout0x0123456789ABCDEF = vminq_u8(vout0x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_u8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const uint8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 16;
+ } else {
+ uint8x8_t vout0x01234567 = vget_low_u8(vout0x0123456789ABCDEF);
+ if (nc & 8) {
+ vst1_u8(c0, vout0x01234567); c0 += 8;
+ vout0x01234567 = vget_high_u8(vout0x0123456789ABCDEF);
+ }
+ if (nc & 4) {
+ vst1_lane_u32((void*) c0, vreinterpret_u32_u8(vout0x01234567), 0); c0 += 4;
+ vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1_lane_u16((void*) c0, vreinterpret_u16_u8(vout0x01234567), 0); c0 += 2;
+ vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1_lane_u8(c0, vout0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
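The `vpacc*`/`vnacc0` pair above exists because UDOT multiplies raw `uint8_t` values, so the kernel zero point is folded out after accumulation rather than subtracted from each weight. A scalar sketch of the identity being exploited (illustrative, not part of this change):

```c
#include <stddef.h>
#include <stdint.h>

// Scalar sketch of the neondot zero-point handling (illustrative only):
//   sum(a[k]*b[k]) - kzp*sum(a[k]) == sum(a[k]*(b[k] - kzp))
// The unsigned subtraction may wrap, but reinterpreting as int32 recovers
// the signed result, exactly like vreinterpretq_s32_u32(vsubq_u32(...)).
static int32_t dot_with_kernel_zero_point(
    const uint8_t* a, const uint8_t* b, size_t kc, uint8_t kernel_zero_point)
{
  uint32_t pacc = 0;  // raw products, mirrors vpacc0x*
  uint32_t nacc = 0;  // zero-point correction, mirrors vnacc0
  for (size_t k = 0; k < kc; k++) {
    pacc += (uint32_t) a[k] * (uint32_t) b[k];
    nacc += (uint32_t) kernel_zero_point * (uint32_t) a[k];
  }
  return (int32_t) (pacc - nacc);
}
```

Because the tail load zero-fills the unused activation lanes, padded positions of `k` contribute to neither sum.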
diff --git a/src/qu8-igemm/gen/1x8-minmax-fp32-neon-mlal-lane.c b/src/qu8-igemm/gen/1x8-minmax-fp32-neon-mlal-lane.c
new file mode 100644
index 0000000..d3aeb32
--- /dev/null
+++ b/src/qu8-igemm/gen/1x8-minmax-fp32-neon-mlal-lane.c
@@ -0,0 +1,227 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/neon-mlal-lane.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/igemm.h>
+
+
+void xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const uint8_t** restrict a,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const uint8_t* zero,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ uint8_t* c0 = c;
+
+ const uint8x8_t vb_zero_point = vld1_dup_u8(&params->fp32_neon.kernel_zero_point[0]);
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
+
+ size_t p = ks;
+ do {
+ const uint8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ a += 1;
+
+ size_t k = kc;
+ while (k >= 8 * sizeof(uint8_t)) {
+ const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
+ const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
+
+ const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+
+
+ const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+
+ k -= 8 * sizeof(uint8_t);
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k);
+ const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
+
+ const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+
+ if (k >= 2 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+
+ if (k > 2 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+
+ if (k >= 4 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+
+ if (k > 4 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+
+ if (k >= 6 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+
+ if (k > 6 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ p -= 1 * sizeof(void*);
+ } while (p != 0);
+
+ // Post-accumulation work
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+
+ const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
+ vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias));
+ vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias));
+
+ const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
+ vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point);
+ vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point);
+
+#if XNN_ARCH_ARM64
+ int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
+
+
+ uint8x8_t vout0x01234567 = vqmovun_s16(vacc0x01234567);
+#else
+ int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
+
+
+ uint8x8_t vout0x01234567 = vqmovun_s16(vacc0x01234567);
+#endif
+
+ const uint8x8_t voutput_min = vld1_dup_u8(&params->fp32_neon.output_min);
+ vout0x01234567 = vmax_u8(vout0x01234567, voutput_min);
+
+ const uint8x8_t voutput_max = vld1_dup_u8(&params->fp32_neon.output_max);
+ vout0x01234567 = vmin_u8(vout0x01234567, voutput_max);
+
+ if (nc >= 8) {
+ vst1_u8(c0 + 0, vout0x01234567);
+
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const uint8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ vst1_lane_u32((void*) c0, vreinterpret_u32_u8(vout0x01234567), 0); c0 += 4;
+ vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1_lane_u16((void*) c0, vreinterpret_u16_u8(vout0x01234567), 0); c0 += 2;
+ vout0x01234567 = vext_u8(vout0x01234567, vout0x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1_lane_u8(c0, vout0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
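As in the other IGEMM microkernels, the outer `do { ... } while (p != 0)` loop above walks an indirection buffer of row pointers rather than a dense A matrix. A simplified MR=1 sketch of that contract (assumed reference code, not part of this change):

```c
#include <stddef.h>
#include <stdint.h>

// Illustrative sketch of the IGEMM indirection walk: `a` holds one row
// pointer per convolution tap. Taps that would read out-of-bounds input
// point at the shared `zero` buffer and must not be rebased by a_offset.
static uint32_t igemm_walk_a_pointers(
    const uint8_t** a, size_t ks, size_t kc,
    const uint8_t* zero, size_t a_offset)
{
  uint32_t acc = 0;
  size_t p = ks;  // ks counts bytes of pointers, as in the kernels above
  do {
    const uint8_t* a0 = a[0];
    if (a0 != zero) {
      a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);  // real rows only
    }
    a += 1;
    for (size_t k = 0; k < kc; k++) {
      acc += a0[k];  // stands in for the multiply-accumulate inner loop
    }
    p -= 1 * sizeof(void*);
  } while (p != 0);
  return acc;
}
```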
diff --git a/src/qu8-igemm/gen/2x16c4-minmax-fp32-neondot.c b/src/qu8-igemm/gen/2x16c4-minmax-fp32-neondot.c
new file mode 100644
index 0000000..e63118b
--- /dev/null
+++ b/src/qu8-igemm/gen/2x16c4-minmax-fp32-neondot.c
@@ -0,0 +1,250 @@
+// Auto-generated file. Do not edit!
+// Template: src/qu8-igemm/c4-neondot.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const uint8_t** restrict a,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const uint8_t* zero,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 2);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 4 * sizeof(uint8_t));
+ uint8_t* c0 = c;
+ uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 2) {
+ c1 = c0;
+ }
+
+ const uint8x8_t va_zero_point = vld1_dup_u8(&params->fp32_neonv8.kernel_zero_point[0]);
+
+ do {
+ // Initialize accumulators with bias. 16 bias values are loaded from the
+ // weight matrix, at the start of the group of 16 columns.
+ uint32x4_t vpacc0x0123 = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0x4567 = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0x89AB = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0xCDEF = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc1x0123 = vpacc0x0123;
+ uint32x4_t vpacc1x4567 = vpacc0x4567;
+ uint32x4_t vpacc1x89AB = vpacc0x89AB;
+ uint32x4_t vpacc1xCDEF = vpacc0xCDEF;
+ uint32x2_t vnacc0 = vmov_n_u32(0);
+ uint32x2_t vnacc1 = vmov_n_u32(0);
+
+ size_t p = ks;
+ do {
+ const uint8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const uint8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ a += 2;
+
+ // Inner accumulation loop along the 16 columns.
+ size_t k = kc;
+ // Loop partially unrolled 2x to load 8 bytes at a time.
+ while (k >= 8 * sizeof(uint8_t)) {
+ // Load a 2x8 block of activations.
+ const uint8x8_t va0x01234567 = vld1_u8(a0); a0 += 8;
+ const uint8x8_t va1x01234567 = vld1_u8(a1); a1 += 8;
+
+ // Load an 8x16 block of weights.
+ const uint8x16_t vb0123x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+
+ // Multiply-accumulate: 2x8 * 8x16 --> 2x16.
+ vnacc0 = vdot_u32(vnacc0, va_zero_point, va0x01234567);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb0123x0123, va0x01234567, 0);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb0123x4567, va0x01234567, 0);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb0123x89AB, va0x01234567, 0);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb4567x0123, va0x01234567, 1);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb4567x4567, va0x01234567, 1);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb4567x89AB, va0x01234567, 1);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
+ vnacc1 = vdot_u32(vnacc1, va_zero_point, va1x01234567);
+ vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb0123x0123, va1x01234567, 0);
+ vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb0123x4567, va1x01234567, 0);
+ vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb0123x89AB, va1x01234567, 0);
+ vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
+ vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb4567x0123, va1x01234567, 1);
+ vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb4567x4567, va1x01234567, 1);
+ vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb4567x89AB, va1x01234567, 1);
+ vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
+
+ k -= 8 * sizeof(uint8_t);
+ }
+ // Handle up to 4 final positions of `k`
+ if XNN_UNLIKELY(k != 0) {
+ // Load a 2x4 block of activations.
+ const uint8x8_t va0x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a0, vmov_n_u32(0), 0)); a0 += 4;
+ const uint8x8_t va1x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a1, vmov_n_u32(0), 0)); a1 += 4;
+
+ // Load a 4x16 block of weights.
+ const uint8x16_t vb0123x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+
+ // Multiply-accumulate: 2x4 * 4x16 --> 2x16.
+ vnacc0 = vdot_u32(vnacc0, va_zero_point, va0x01234567);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb0123x0123, va0x01234567, 0);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb0123x4567, va0x01234567, 0);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb0123x89AB, va0x01234567, 0);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
+ vnacc1 = vdot_u32(vnacc1, va_zero_point, va1x01234567);
+ vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb0123x0123, va1x01234567, 0);
+ vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb0123x4567, va1x01234567, 0);
+ vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb0123x89AB, va1x01234567, 0);
+ vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
+ }
+ p -= 2 * sizeof(void*);
+ } while (p != 0);
+
+ // Subtract zero point from accumulators.
+ vnacc0 = vpadd_u32(vnacc0, vnacc0);
+ const uint32x4_t vnacc0x0123 = vcombine_u32(vnacc0, vnacc0);
+ int32x4_t vacc0x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc0x0123, vnacc0x0123));
+ int32x4_t vacc0x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc0x4567, vnacc0x0123));
+ int32x4_t vacc0x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc0x89AB, vnacc0x0123));
+ int32x4_t vacc0xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc0xCDEF, vnacc0x0123));
+ vnacc1 = vpadd_u32(vnacc1, vnacc1);
+ const uint32x4_t vnacc1x0123 = vcombine_u32(vnacc1, vnacc1);
+ int32x4_t vacc1x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc1x0123, vnacc1x0123));
+ int32x4_t vacc1x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc1x4567, vnacc1x0123));
+ int32x4_t vacc1x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc1x89AB, vnacc1x0123));
+ int32x4_t vacc1xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc1xCDEF, vnacc1x0123));
+
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+ float32x4_t vfpacc0x89AB = vcvtq_f32_s32(vacc0x89AB);
+ float32x4_t vfpacc0xCDEF = vcvtq_f32_s32(vacc0xCDEF);
+ float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123);
+ float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567);
+ float32x4_t vfpacc1x89AB = vcvtq_f32_s32(vacc1x89AB);
+ float32x4_t vfpacc1xCDEF = vcvtq_f32_s32(vacc1xCDEF);
+
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+ vfpacc0x89AB = vmulq_f32(vfpacc0x89AB, vscale);
+ vfpacc0xCDEF = vmulq_f32(vfpacc0xCDEF, vscale);
+ vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale);
+ vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale);
+ vfpacc1x89AB = vmulq_f32(vfpacc1x89AB, vscale);
+ vfpacc1xCDEF = vmulq_f32(vfpacc1xCDEF, vscale);
+
+ vacc0x0123 = vcvtnq_s32_f32(vfpacc0x0123);
+ vacc0x4567 = vcvtnq_s32_f32(vfpacc0x4567);
+ vacc0x89AB = vcvtnq_s32_f32(vfpacc0x89AB);
+ vacc0xCDEF = vcvtnq_s32_f32(vfpacc0xCDEF);
+ vacc1x0123 = vcvtnq_s32_f32(vfpacc1x0123);
+ vacc1x4567 = vcvtnq_s32_f32(vfpacc1x4567);
+ vacc1x89AB = vcvtnq_s32_f32(vfpacc1x89AB);
+ vacc1xCDEF = vcvtnq_s32_f32(vfpacc1xCDEF);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+
+ uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF);
+ uint8x16_t vout1x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+ uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF));
+ uint8x16_t vout1x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc1x01234567), vqmovun_s16(vacc1x89ABCDEF));
+#endif
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max);
+
+ vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min);
+ vout1x0123456789ABCDEF = vmaxq_u8(vout1x0123456789ABCDEF, voutput_min);
+
+ vout0x0123456789ABCDEF = vminq_u8(vout0x0123456789ABCDEF, voutput_max);
+ vout1x0123456789ABCDEF = vminq_u8(vout1x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_u8(c1 + 0, vout1x0123456789ABCDEF);
+ vst1q_u8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const uint8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 16;
+ } else {
+ uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vget_low_u8(vout0x0123456789ABCDEF), vget_low_u8(vout1x0123456789ABCDEF));
+ if (nc & 8) {
+ vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567)); c1 += 8;
+ vst1_u8(c0, vget_low_u8(vout0x01234567_1x01234567)); c0 += 8;
+ vout0x01234567_1x01234567 = vcombine_u8(vget_high_u8(vout0x0123456789ABCDEF), vget_high_u8(vout1x0123456789ABCDEF));
+ }
+ if (nc & 4) {
+ vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
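Unlike the neon-mlal-lane kernels above, which round via the magic-bias trick, this neondot path uses `vcvtnq_s32_f32` (FCVTNS), a native round-to-nearest-even conversion, and adds the output zero point after the first narrowing. A scalar sketch, assuming the default FE_TONEAREST rounding mode so that `lrintf` matches FCVTNS:

```c
#include <math.h>
#include <stdint.h>

// Scalar sketch of the fp32 requantization on the neonv8 path (illustrative).
static uint8_t requantize_fp32_rne(
    int32_t acc, float scale, int16_t output_zero_point,
    uint8_t output_min, uint8_t output_max)
{
  long out = lrintf((float) acc * scale);  // vcvtnq_s32_f32 (FCVTNS)
  out += (long) output_zero_point;         // vqaddq_s16 after vqmovn_s32
  if (out < (long) output_min) out = output_min;  // vmaxq_u8
  if (out > (long) output_max) out = output_max;  // vminq_u8
  return (uint8_t) out;
}
```

The vector code saturates at the s32-to-s16 and s16-to-u8 narrowings; for values inside the u8 clamp range the scalar clamp above is equivalent.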
diff --git a/src/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S b/src/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
new file mode 100644
index 0000000..c0e1b86
--- /dev/null
+++ b/src/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-cortex-a55.S
@@ -0,0 +1,742 @@
+// Auto-generated file. Do not edit!
+// Template: src/qu8-igemm/4x16c4-aarch64-neondot-cortex-a55.S.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const uint8_t** restrict a, x4
+# const void* restrict w, x5
+# uint8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> (x0)
+# size_t a_offset, [sp + 8] -> x8
+# const uint8_t* zero, [sp + 16] -> x12
+# const union xnn_qu8_conv_minmax_params params) [sp + 24] -> (x11)
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0 v4
+# A1 x14 v1 v5
+# A2 x15 v2 v6
+# A3 x10 v3 (v0)
+# B x5 v8 v9 v10 v11
+# C0 x6 v16 v20 v24 v28
+# C1 x16 v17 v21 v25 v29
+# C2 x17 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# zero point v7 v12 v13 v14 v15
+
+# x11 temp for Cortex-A55 loads
+
+BEGIN_FUNCTION xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+
+ # Clamp C pointers
+ CMP x0, 2 // if mr < 2
+ LDR x8, [sp, 8] // Load a_offset
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x16, x6, x16, LO // c1 = c0
+ LDP x12, x11, [sp, 16] // Load zero pointer, params
+ ADD x2, x2, 3 // kc = (kc + 3) & ~3
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ # Save d8-d15 to stack
+ STP d8, d9, [sp, -64]!
+
+ CSEL x17, x16, x17, LS // c2 = c1
+ BIC x2, x2, 3
+ STP d10, d11, [sp, 16]
+ CMP x0, 4 // if mr < 4
+ ADD x7, x17, x7 // c3 = c2 + cm_stride
+ STP d12, d13, [sp, 32]
+ CSEL x7, x17, x7, LO // c3 = c2
+ LD1R {v7.4s}, [x11], 4 // kernel_zero_point
+ STP d14, d15, [sp, 48]
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+
+ MOVI v12.4s, 0
+ MOVI v13.4s, 0
+ MOVI v14.4s, 0
+ MOVI v15.4s, 0
+
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+
+ MOV x9, x3 // p = ks
+
+ .p2align 3
+1:
+ # Load next 4 A pointers
+ LDP x13, x14, [x4], 16
+ LDP x15, x10, [x4], 16
+
+ CMP x13, x12 // if a0 == zero
+ ADD x13, x13, x8 // a0 += a_offset
+ CSEL x13, x12, x13, EQ // a0 = zero, else a0 + a_offset
+ CMP x14, x12 // if a1 == zero
+ ADD x14, x14, x8 // a1 += a_offset
+ CSEL x14, x12, x14, EQ // a1 = zero, else a1 + a_offset
+ CMP x15, x12 // if a2 == zero
+ ADD x15, x15, x8 // a2 += a_offset
+ CSEL x15, x12, x15, EQ // a2 = zero, else a2 + a_offset
+ CMP x10, x12 // if a3 == zero
+ ADD x10, x10, x8 // a3 += a_offset
+ CSEL x10, x12, x10, EQ // a3 = zero, else a3 + a_offset
+
+ # Is there at least 16 bytes for prologue/epilogue?
+ SUBS x0, x2, 16 // k = kc - 16
+ B.LO 5f
+
+ # prologue - read A and B values for block 0 and 1
+ LDR q8, [x5], 16
+ LDR d0, [x13], 8
+ LDR d1, [x14], 8
+ LDR d2, [x15], 8
+ LDR d3, [x10], 8
+ SUBS x0, x0, 16 // is there 16 for main loop?
+ LDR d9, [x5], 8
+ LDR x11, [x5], 8
+ # Is there at least 16 bytes for main loop?
+ B.LO 3f
+
+ # Main loop - 16 bytes of A in 4 groups.
+ # 4 rows, 4 vectors wide = 16 UDOT instructions per group of 4 channels.
+ # 4 LD64 for A.
+ # 4 LD128 for W, each split into 2 LD64 + INS.
+ # For each group of 4 UDOTs: 1 LD64 for A, 2 LD64 + INS for W.
+
+ .p2align 3
+2:
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v0.4b[0]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v1.4b[0]
+ INS v9.d[1], x11
+ UDOT v18.4s, v8.16b, v2.4b[0]
+ LDR x11, [x5], 8
+ UDOT v19.4s, v8.16b, v3.4b[0]
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v0.4b[0]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v1.4b[0]
+ INS v10.d[1], x11
+ UDOT v22.4s, v9.16b, v2.4b[0]
+ LDR x11, [x5], 8
+ UDOT v23.4s, v9.16b, v3.4b[0]
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v0.4b[0]
+ LDR d8, [x5], 8
+ UDOT v25.4s, v10.16b, v1.4b[0]
+ INS v11.d[1], x11
+ UDOT v26.4s, v10.16b, v2.4b[0]
+ LDR x11, [x5], 8
+ UDOT v27.4s, v10.16b, v3.4b[0]
+
+ # BLOCK 3
+ UDOT v28.4s, v11.16b, v0.4b[0]
+ LDR d9, [x5], 8
+ UDOT v29.4s, v11.16b, v1.4b[0]
+ INS v8.d[1], x11
+ UDOT v30.4s, v11.16b, v2.4b[0]
+ LDR x11, [x5], 8
+ UDOT v31.4s, v11.16b, v3.4b[0]
+
+ UDOT v12.2s, v7.8b, v0.8b
+ UDOT v13.2s, v7.8b, v1.8b
+ UDOT v14.2s, v7.8b, v2.8b
+ UDOT v15.2s, v7.8b, v3.8b
+
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v0.4b[1]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v1.4b[1]
+ INS v9.d[1], x11
+ UDOT v18.4s, v8.16b, v2.4b[1]
+ LDR x11, [x5], 8
+ UDOT v19.4s, v8.16b, v3.4b[1]
+ LDR d4, [x13], 8
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v0.4b[1]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v1.4b[1]
+ INS v10.d[1], x11
+ UDOT v22.4s, v9.16b, v2.4b[1]
+ LDR x11, [x5], 8
+ UDOT v23.4s, v9.16b, v3.4b[1]
+ LDR d5, [x14], 8
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v0.4b[1]
+ LDR d8, [x5], 8
+ UDOT v25.4s, v10.16b, v1.4b[1]
+ INS v11.d[1], x11
+ UDOT v26.4s, v10.16b, v2.4b[1]
+ LDR x11, [x5], 8
+ UDOT v27.4s, v10.16b, v3.4b[1]
+ LDR d6, [x15], 8
+
+ # BLOCK 3
+ UDOT v28.4s, v11.16b, v0.4b[1]
+ LDR d9, [x5], 8
+ UDOT v29.4s, v11.16b, v1.4b[1]
+ INS v8.d[1], x11
+ UDOT v30.4s, v11.16b, v2.4b[1]
+ LDR x11, [x5], 8
+ UDOT v31.4s, v11.16b, v3.4b[1]
+ LDR d0, [x10], 8
+
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v4.4b[0]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v5.4b[0]
+ INS v9.d[1], x11
+ UDOT v18.4s, v8.16b, v6.4b[0]
+ LDR x11, [x5], 8
+ UDOT v19.4s, v8.16b, v0.4b[0]
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v4.4b[0]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v5.4b[0]
+ INS v10.d[1], x11
+ UDOT v22.4s, v9.16b, v6.4b[0]
+ LDR x11, [x5], 8
+ UDOT v23.4s, v9.16b, v0.4b[0]
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v4.4b[0]
+ LDR d8, [x5], 8
+ UDOT v25.4s, v10.16b, v5.4b[0]
+ INS v11.d[1], x11
+ UDOT v26.4s, v10.16b, v6.4b[0]
+ LDR x11, [x5], 8
+ UDOT v27.4s, v10.16b, v0.4b[0]
+
+ # BLOCK 3
+ UDOT v28.4s, v11.16b, v4.4b[0]
+ LDR d9, [x5], 8
+ UDOT v29.4s, v11.16b, v5.4b[0]
+ INS v8.d[1], x11
+ UDOT v30.4s, v11.16b, v6.4b[0]
+ LDR x11, [x5], 8
+ UDOT v31.4s, v11.16b, v0.4b[0]
+
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v4.4b[1]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v5.4b[1]
+ INS v9.d[1], x11
+ UDOT v18.4s, v8.16b, v6.4b[1]
+ LDR x11, [x5], 8
+ UDOT v19.4s, v8.16b, v0.4b[1]
+ LDR d1, [x14], 8
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v4.4b[1]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v5.4b[1]
+ INS v10.d[1], x11
+ UDOT v22.4s, v9.16b, v6.4b[1]
+ LDR x11, [x5], 8
+ UDOT v23.4s, v9.16b, v0.4b[1]
+ LDR d2, [x15], 8
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v4.4b[1]
+ LDR d8, [x5], 8 // First B values for block 0 and 1
+ UDOT v25.4s, v10.16b, v5.4b[1]
+ INS v11.d[1], x11
+ UDOT v26.4s, v10.16b, v6.4b[1]
+ LDR x11, [x5], 8
+ UDOT v27.4s, v10.16b, v0.4b[1]
+ LDR d3, [x10], 8
+
+ # BLOCK 3 special
+ UDOT v31.4s, v11.16b, v0.4b[1]
+ LDR d9, [x5], 8
+ UDOT v15.2s, v7.8b, v0.8b // free up v0 early
+ INS v8.d[1], x11
+ UDOT v28.4s, v11.16b, v4.4b[1]
+ LDR x11, [x5], 8
+ UDOT v29.4s, v11.16b, v5.4b[1]
+ LDR d0, [x13], 8
+ UDOT v30.4s, v11.16b, v6.4b[1]
+ SUBS x0, x0, 16
+
+ UDOT v12.2s, v7.8b, v4.8b
+ UDOT v13.2s, v7.8b, v5.8b
+ UDOT v14.2s, v7.8b, v6.8b
+ B.HS 2b
+
+ # Epilogue. Same as main loop but no preloads in final group
+3:
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v0.4b[0]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v1.4b[0]
+ INS v9.d[1], x11
+ UDOT v18.4s, v8.16b, v2.4b[0]
+ LDR x11, [x5], 8
+ UDOT v19.4s, v8.16b, v3.4b[0]
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v0.4b[0]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v1.4b[0]
+ INS v10.d[1], x11
+ UDOT v22.4s, v9.16b, v2.4b[0]
+ LDR x11, [x5], 8
+ UDOT v23.4s, v9.16b, v3.4b[0]
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v0.4b[0]
+ LDR d8, [x5], 8
+ UDOT v25.4s, v10.16b, v1.4b[0]
+ INS v11.d[1], x11
+ UDOT v26.4s, v10.16b, v2.4b[0]
+ LDR x11, [x5], 8
+ UDOT v27.4s, v10.16b, v3.4b[0]
+
+ # BLOCK 3
+ UDOT v28.4s, v11.16b, v0.4b[0]
+ LDR d9, [x5], 8
+ UDOT v29.4s, v11.16b, v1.4b[0]
+ INS v8.d[1], x11
+ UDOT v30.4s, v11.16b, v2.4b[0]
+ LDR x11, [x5], 8
+ UDOT v31.4s, v11.16b, v3.4b[0]
+
+ UDOT v12.2s, v7.8b, v0.8b
+ UDOT v13.2s, v7.8b, v1.8b
+ UDOT v14.2s, v7.8b, v2.8b
+ UDOT v15.2s, v7.8b, v3.8b
+
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v0.4b[1]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v1.4b[1]
+ INS v9.d[1], x11
+ UDOT v18.4s, v8.16b, v2.4b[1]
+ LDR x11, [x5], 8
+ UDOT v19.4s, v8.16b, v3.4b[1]
+ LDR d4, [x13], 8
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v0.4b[1]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v1.4b[1]
+ INS v10.d[1], x11
+ UDOT v22.4s, v9.16b, v2.4b[1]
+ LDR x11, [x5], 8
+ UDOT v23.4s, v9.16b, v3.4b[1]
+ LDR d5, [x14], 8
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v0.4b[1]
+ LDR d8, [x5], 8
+ UDOT v25.4s, v10.16b, v1.4b[1]
+ INS v11.d[1], x11
+ UDOT v26.4s, v10.16b, v2.4b[1]
+ LDR x11, [x5], 8
+ UDOT v27.4s, v10.16b, v3.4b[1]
+ LDR d6, [x15], 8
+
+ # BLOCK 3
+ UDOT v28.4s, v11.16b, v0.4b[1]
+ LDR d9, [x5], 8
+ UDOT v29.4s, v11.16b, v1.4b[1]
+ INS v8.d[1], x11
+ UDOT v30.4s, v11.16b, v2.4b[1]
+ LDR x11, [x5], 8
+ UDOT v31.4s, v11.16b, v3.4b[1]
+ LDR d0, [x10], 8
+
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v4.4b[0]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v5.4b[0]
+ INS v9.d[1], x11
+ UDOT v18.4s, v8.16b, v6.4b[0]
+ LDR x11, [x5], 8
+ UDOT v19.4s, v8.16b, v0.4b[0]
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v4.4b[0]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v5.4b[0]
+ INS v10.d[1], x11
+ UDOT v22.4s, v9.16b, v6.4b[0]
+ LDR x11, [x5], 8
+ UDOT v23.4s, v9.16b, v0.4b[0]
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v4.4b[0]
+ LDR d8, [x5], 8
+ UDOT v25.4s, v10.16b, v5.4b[0]
+ INS v11.d[1], x11
+ UDOT v26.4s, v10.16b, v6.4b[0]
+ LDR x11, [x5], 8
+ UDOT v27.4s, v10.16b, v0.4b[0]
+
+ # BLOCK 3
+ UDOT v28.4s, v11.16b, v4.4b[0]
+ LDR d9, [x5], 8
+ UDOT v29.4s, v11.16b, v5.4b[0]
+ INS v8.d[1], x11
+ UDOT v30.4s, v11.16b, v6.4b[0]
+ LDR x11, [x5], 8
+ UDOT v31.4s, v11.16b, v0.4b[0]
+
+ # BLOCK 0
+ UDOT v16.4s, v8.16b, v4.4b[1]
+ LDR d10, [x5], 8
+ UDOT v17.4s, v8.16b, v5.4b[1]
+ INS v9.d[1], x11
+ UDOT v18.4s, v8.16b, v6.4b[1]
+ LDR x11, [x5], 8
+ UDOT v19.4s, v8.16b, v0.4b[1]
+
+ # BLOCK 1
+ UDOT v20.4s, v9.16b, v4.4b[1]
+ LDR d11, [x5], 8
+ UDOT v21.4s, v9.16b, v5.4b[1]
+ INS v10.d[1], x11
+ UDOT v22.4s, v9.16b, v6.4b[1]
+ LDR x11, [x5], 8
+ UDOT v23.4s, v9.16b, v0.4b[1]
+
+ # BLOCK 2
+ UDOT v24.4s, v10.16b, v4.4b[1]
+ UDOT v25.4s, v10.16b, v5.4b[1]
+ INS v11.d[1], x11
+ UDOT v26.4s, v10.16b, v6.4b[1]
+ UDOT v27.4s, v10.16b, v0.4b[1]
+
+ # BLOCK 3
+ UDOT v28.4s, v11.16b, v4.4b[1]
+ UDOT v29.4s, v11.16b, v5.4b[1]
+ UDOT v30.4s, v11.16b, v6.4b[1]
+ UDOT v31.4s, v11.16b, v0.4b[1]
+
+ UDOT v12.2s, v7.8b, v4.8b
+ UDOT v13.2s, v7.8b, v5.8b
+ UDOT v14.2s, v7.8b, v6.8b
+ UDOT v15.2s, v7.8b, v0.8b
+
+ # Is there a remainder? (4 to 12 bytes of A)
+ TST x0, 15
+ B.NE 5f
+
+4:
+ # ks loop
+ SUBS x9, x9, 32 // ks -= MR * sizeof(void*)
+ B.HI 1b
+
+ ADDP v0.2s, v12.2s, v13.2s
+ ADDP v1.2s, v14.2s, v15.2s
+ LDR x11, [sp, 88] // Reload params
+ DUP v12.4s, v0.s[0]
+ DUP v13.4s, v0.s[1]
+ DUP v14.4s, v1.s[0]
+ DUP v15.4s, v1.s[1]
+ ADD x11, x11, 4 // skip kernel_zero_point, point at scale
+
+ # Subtract zero point from accumulators
+ SUB v16.4s, v16.4s, v12.4s
+ SUB v17.4s, v17.4s, v13.4s
+ SUB v18.4s, v18.4s, v14.4s
+ SUB v19.4s, v19.4s, v15.4s
+ SUB v20.4s, v20.4s, v12.4s
+ SUB v21.4s, v21.4s, v13.4s
+ SUB v22.4s, v22.4s, v14.4s
+ SUB v23.4s, v23.4s, v15.4s
+ SUB v24.4s, v24.4s, v12.4s
+ SUB v25.4s, v25.4s, v13.4s
+ SUB v26.4s, v26.4s, v14.4s
+ SUB v27.4s, v27.4s, v15.4s
+ SUB v28.4s, v28.4s, v12.4s
+ SUB v29.4s, v29.4s, v13.4s
+ SUB v30.4s, v30.4s, v14.4s
+ SUB v31.4s, v31.4s, v15.4s
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ # Apply params - scale, bias and clamp
+ LD1R {v4.4s}, [x11], 4
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v16.4s, v4.4s
+ FMUL v17.4s, v17.4s, v4.4s
+ FMUL v18.4s, v18.4s, v4.4s
+ FMUL v19.4s, v19.4s, v4.4s
+ FMUL v20.4s, v20.4s, v4.4s
+ FMUL v21.4s, v21.4s, v4.4s
+ FMUL v22.4s, v22.4s, v4.4s
+ FMUL v23.4s, v23.4s, v4.4s
+ FMUL v24.4s, v24.4s, v4.4s
+ FMUL v25.4s, v25.4s, v4.4s
+ FMUL v26.4s, v26.4s, v4.4s
+ FMUL v27.4s, v27.4s, v4.4s
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // output_zero_point
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v6.8h
+ SQADD v17.8h, v17.8h, v6.8h
+ SQADD v18.8h, v18.8h, v6.8h
+ SQADD v19.8h, v19.8h, v6.8h
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v4.16b}, [x11], 1 // clamp min value
+
+ SQXTUN v0.8b, v16.8h
+ SQXTUN v1.8b, v17.8h
+ SQXTUN v2.8b, v18.8h
+ SQXTUN v3.8b, v19.8h
+ LD1R {v5.16b}, [x11] // clamp max value
+ SQXTUN2 v0.16b, v24.8h
+ SQXTUN2 v1.16b, v25.8h
+ SQXTUN2 v2.16b, v26.8h
+ SQXTUN2 v3.16b, v27.8h
+ LDR x0, [sp, 64] // Load cn_stride
+
+ UMAX v0.16b, v0.16b, v4.16b
+ UMAX v1.16b, v1.16b, v4.16b
+ UMAX v2.16b, v2.16b, v4.16b
+ UMAX v3.16b, v3.16b, v4.16b
+ SUBS x1, x1, 16
+ UMIN v0.16b, v0.16b, v5.16b
+ UMIN v1.16b, v1.16b, v5.16b
+ UMIN v2.16b, v2.16b, v5.16b
+ UMIN v3.16b, v3.16b, v5.16b
+ B.LO 7f
+
+ # Store full 4 x 16
+ ST1 {v3.16b}, [x7], x0
+ ST1 {v2.16b}, [x17], x0
+ ST1 {v1.16b}, [x16], x0
+ ST1 {v0.16b}, [x6], x0
+
+ SUB x4, x4, x3 // a -= ks
+
+ # nc loop
+ B.HI 0b
+
+ # Restore d8-d15 from stack
+ LDP d14, d15, [sp, 48]
+ LDP d12, d13, [sp, 32]
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 64
+ RET
+
+ # Remainder: 4 to 12 bytes of A
+ .p2align 3
+5:
+ TBZ x0, 3, 6f
+
+ LDR d0, [x13], 8
+ LDP q8, q9, [x5], 32
+ LDR d1, [x14], 8
+ LDR d2, [x15], 8
+ LDR d3, [x10], 8
+ LDP q10, q11, [x5], 32
+ UDOT v12.2s, v7.8b, v0.8b
+ UDOT v13.2s, v7.8b, v1.8b
+ UDOT v14.2s, v7.8b, v2.8b
+ UDOT v15.2s, v7.8b, v3.8b
+ UDOT v16.4s, v8.16b, v0.4b[0]
+ UDOT v17.4s, v8.16b, v1.4b[0]
+ UDOT v18.4s, v8.16b, v2.4b[0]
+ UDOT v19.4s, v8.16b, v3.4b[0]
+ UDOT v20.4s, v9.16b, v0.4b[0]
+ UDOT v21.4s, v9.16b, v1.4b[0]
+ UDOT v22.4s, v9.16b, v2.4b[0]
+ UDOT v23.4s, v9.16b, v3.4b[0]
+ UDOT v24.4s, v10.16b, v0.4b[0]
+ UDOT v25.4s, v10.16b, v1.4b[0]
+ UDOT v26.4s, v10.16b, v2.4b[0]
+ UDOT v27.4s, v10.16b, v3.4b[0]
+ UDOT v28.4s, v11.16b, v0.4b[0]
+ UDOT v29.4s, v11.16b, v1.4b[0]
+ UDOT v30.4s, v11.16b, v2.4b[0]
+ UDOT v31.4s, v11.16b, v3.4b[0]
+ LDP q8, q9, [x5], 32
+ LDP q10, q11, [x5], 32
+ UDOT v16.4s, v8.16b, v0.4b[1]
+ UDOT v17.4s, v8.16b, v1.4b[1]
+ UDOT v18.4s, v8.16b, v2.4b[1]
+ UDOT v19.4s, v8.16b, v3.4b[1]
+ UDOT v20.4s, v9.16b, v0.4b[1]
+ UDOT v21.4s, v9.16b, v1.4b[1]
+ UDOT v22.4s, v9.16b, v2.4b[1]
+ UDOT v23.4s, v9.16b, v3.4b[1]
+ UDOT v24.4s, v10.16b, v0.4b[1]
+ UDOT v25.4s, v10.16b, v1.4b[1]
+ UDOT v26.4s, v10.16b, v2.4b[1]
+ UDOT v27.4s, v10.16b, v3.4b[1]
+ UDOT v28.4s, v11.16b, v0.4b[1]
+ UDOT v29.4s, v11.16b, v1.4b[1]
+ UDOT v30.4s, v11.16b, v2.4b[1]
+ UDOT v31.4s, v11.16b, v3.4b[1]
+ TBZ x0, 2, 4b
+6:
+ LDR s0, [x13], 4
+ LDP q8, q9, [x5], 32
+ LDR s1, [x14], 4
+ LDR s2, [x15], 4
+ LDR s3, [x10], 4
+ LDP q10, q11, [x5], 32
+ UDOT v12.2s, v7.8b, v0.8b
+ UDOT v13.2s, v7.8b, v1.8b
+ UDOT v14.2s, v7.8b, v2.8b
+ UDOT v15.2s, v7.8b, v3.8b
+ UDOT v16.4s, v8.16b, v0.4b[0]
+ UDOT v17.4s, v8.16b, v1.4b[0]
+ UDOT v18.4s, v8.16b, v2.4b[0]
+ UDOT v19.4s, v8.16b, v3.4b[0]
+ UDOT v20.4s, v9.16b, v0.4b[0]
+ UDOT v21.4s, v9.16b, v1.4b[0]
+ UDOT v22.4s, v9.16b, v2.4b[0]
+ UDOT v23.4s, v9.16b, v3.4b[0]
+ UDOT v24.4s, v10.16b, v0.4b[0]
+ UDOT v25.4s, v10.16b, v1.4b[0]
+ UDOT v26.4s, v10.16b, v2.4b[0]
+ UDOT v27.4s, v10.16b, v3.4b[0]
+ UDOT v28.4s, v11.16b, v0.4b[0]
+ UDOT v29.4s, v11.16b, v1.4b[0]
+ UDOT v30.4s, v11.16b, v2.4b[0]
+ UDOT v31.4s, v11.16b, v3.4b[0]
+ B 4b
+
+ # Store odd width
+ .p2align 3
+7:
+ TBZ x1, 3, 8f
+ STR d3, [x7], 8
+ STR d2, [x17], 8
+ DUP d3, v3.d[1]
+ DUP d2, v2.d[1]
+ STR d1, [x16], 8
+ STR d0, [x6], 8
+ DUP d1, v1.d[1]
+ DUP d0, v0.d[1]
+8:
+ TBZ x1, 2, 9f
+ STR s3, [x7], 4
+ STR s2, [x17], 4
+ DUP s3, v3.s[1]
+ DUP s2, v2.s[1]
+ STR s1, [x16], 4
+ STR s0, [x6], 4
+ DUP s1, v1.s[1]
+ DUP s0, v0.s[1]
+9:
+ TBZ x1, 1, 10f
+ STR h3, [x7], 2
+ STR h2, [x17], 2
+ DUP h3, v3.h[1]
+ DUP h2, v2.h[1]
+ STR h1, [x16], 2
+ STR h0, [x6], 2
+ DUP h1, v1.h[1]
+ DUP h0, v0.h[1]
+10:
+ TBZ x1, 0, 11f
+ STR b3, [x7]
+ STR b2, [x17]
+ STR b1, [x16]
+ STR b0, [x6]
+11:
+ # Restore d8-d15 from stack
+ LDP d14, d15, [sp, 48]
+ LDP d12, d13, [sp, 32]
+ LDP d10, d11, [sp, 16]
+ LDP d8, d9, [sp], 64
+ RET
+
+END_FUNCTION xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
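Both this kernel and the ld128 variant that follows share the same control-flow shape: kc is rounded up to a multiple of 4 (the ADD/BIC pair), the main loop consumes 16 bytes of A per iteration, and TBZ on bits 3 and 2 of the leftover count dispatches the 8- and 4-byte remainder blocks. A C-level sketch of that structure (illustrative only):

```c
#include <stddef.h>

// Illustrative sketch of the k-loop shape in the assembly kernels.
static void k_loop_shape(size_t kc) {
  kc = (kc + 3) & ~(size_t) 3;  // ADD x2, x2, 3 / BIC x2, x2, 3
  size_t k = kc;
  while (k >= 16) {             // main loop: 16 bytes of A per iteration
    /* 4 groups of UDOTs over 4 bytes of A each */
    k -= 16;
  }
  if (k & 8) {                  // TBZ x0, 3: 8-byte remainder block
    /* two groups of UDOTs */
  }
  if (k & 4) {                  // TBZ x0, 2: 4-byte remainder block
    /* one group of UDOTs */
  }
}
```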
diff --git a/src/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S b/src/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
new file mode 100644
index 0000000..2ffe811
--- /dev/null
+++ b/src/qu8-igemm/gen/4x16c4-minmax-fp32-aarch64-neondot-ld128.S
@@ -0,0 +1,501 @@
+// Auto-generated file. Do not edit!
+// Template: src/qu8-igemm/4x16c4-aarch64-neondot-ld128.S.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+
+#include <xnnpack/assembly.h>
+
+# void xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128(
+# size_t mr, x0
+# size_t nc, x1
+# size_t kc, x2 / x0
+# size_t ks, x3 / x9
+# const uint8_t** restrict a, x4
+# const void* restrict w, x5
+# uint8_t* restrict c, x6
+# size_t cm_stride, x7
+# size_t cn_stride, [sp] -> (x0)
+# size_t a_offset, [sp + 8] -> x8
+# const uint8_t* zero, [sp + 16] -> x12
+# const union xnn_qu8_conv_minmax_params params) [sp + 24] -> x11
+
+# d8-d15, x19-x30 need to be preserved if used. x18 is reserved by the OS.
+
+# Register usage
+# A0 x13 v0
+# A1 x14 v1
+# A2 x15 v2
+# A3 x10 v3
+# B x5 v4 v5 v6 v7
+# C0 x6 v16 v20 v24 v28
+# C1 x16 v17 v21 v25 v29
+# C2 x17 v18 v22 v26 v30
+# C3 x7 v19 v23 v27 v31
+# zero_point v8 v12 v13 v14 v15
+# unused v9 v10 v11
+
+BEGIN_FUNCTION xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
+
+ # Clamp C pointers
+ CMP x0, 2 // if mr < 2
+ LDR x8, [sp, 8] // Load a_offset
+ ADD x16, x6, x7 // c1 = c0 + cm_stride
+ CSEL x16, x6, x16, LO // c1 = c0
+ LDP x12, x11, [sp, 16] // Load zero pointer, params
+ ADD x2, x2, 3 // kc = (kc + 3) & ~3
+ ADD x17, x16, x7 // c2 = c1 + cm_stride
+ // if mr <= 2
+ # Save d8,d12-d15 on stack
+ STR d8, [sp, -48]!
+ CSEL x17, x16, x17, LS // c2 = c1
+ BIC x2, x2, 3
+ STP d12, d13, [sp, 16]
+ CMP x0, 4 // if mr < 4
+ ADD x7, x17, x7 // c3 = c2 + cm_stride
+ STP d14, d15, [sp, 32]
+ CSEL x7, x17, x7, LO // c3 = c2
+ LD1R {v8.4s}, [x11], 4 // kernel_zero_point
+
+ .p2align 3
+0:
+ # Load initial bias from w into accumulators
+ LDP q16, q20, [x5], 32
+
+ MOVI v12.4s, 0
+ MOVI v13.4s, 0
+ MOVI v14.4s, 0
+ MOVI v15.4s, 0
+
+ MOV v17.16b, v16.16b
+ MOV v18.16b, v16.16b
+ LDP q24, q28, [x5], 32
+ MOV v19.16b, v16.16b
+ MOV v21.16b, v20.16b
+ MOV v22.16b, v20.16b
+ MOV v23.16b, v20.16b
+ MOV v25.16b, v24.16b
+ MOV v26.16b, v24.16b
+ MOV v27.16b, v24.16b
+ MOV v29.16b, v28.16b
+ MOV v30.16b, v28.16b
+ MOV v31.16b, v28.16b
+
+ MOV x9, x3 // p = ks
+
+ .p2align 3
+1:
+ # Load next 4 A pointers
+ LDP x13, x14, [x4], 16
+ LDP x15, x10, [x4], 16
+
+ CMP x13, x12 // if a0 == zero
+ ADD x13, x13, x8 // a0 += a_offset
+ CSEL x13, x12, x13, EQ // a0 = zero, else += a0 + a_offset
+ CMP x14, x12 // if a1 == zero
+ ADD x14, x14, x8 // a1 += a_offset
+ CSEL x14, x12, x14, EQ // a1 = zero, else += a1 + a_offset
+ CMP x15, x12 // if a2 == zero
+ ADD x15, x15, x8 // a2 += a_offset
+ CSEL x15, x12, x15, EQ // a2 = zero, else a2 += a_offset
+ CMP x10, x12 // if a3 == zero
+ ADD x10, x10, x8 // a3 += a_offset
+ CSEL x10, x12, x10, EQ // a3 = zero, else a3 += a_offset
+
+ # Are there at least 16 bytes for the main loop?
+ SUBS x0, x2, 16 // k = kc - 16
+ B.LO 4f
+
+ # Main loop - 16 bytes of A
+ .p2align 3
+2:
+ LDR q0, [x13], 16
+ LDR q4, [x5], 16
+ LDR q1, [x14], 16
+ LDR q2, [x15], 16
+ LDR q3, [x10], 16
+ LDR q5, [x5], 16
+
+ UDOT v12.4s, v8.16b, v0.16b // update zero point
+ UDOT v13.4s, v8.16b, v1.16b
+ UDOT v14.4s, v8.16b, v2.16b
+ UDOT v15.4s, v8.16b, v3.16b
+
+ UDOT v16.4s, v4.16b, v0.4b[0]
+ UDOT v17.4s, v4.16b, v1.4b[0]
+ LDP q6, q7, [x5], 32
+ UDOT v18.4s, v4.16b, v2.4b[0]
+ UDOT v19.4s, v4.16b, v3.4b[0]
+ UDOT v20.4s, v5.16b, v0.4b[0]
+ UDOT v21.4s, v5.16b, v1.4b[0]
+ UDOT v22.4s, v5.16b, v2.4b[0]
+ UDOT v23.4s, v5.16b, v3.4b[0]
+ UDOT v24.4s, v6.16b, v0.4b[0]
+ UDOT v25.4s, v6.16b, v1.4b[0]
+ LDP q4, q5, [x5], 32
+ UDOT v26.4s, v6.16b, v2.4b[0]
+ UDOT v27.4s, v6.16b, v3.4b[0]
+ UDOT v28.4s, v7.16b, v0.4b[0]
+ UDOT v29.4s, v7.16b, v1.4b[0]
+ UDOT v30.4s, v7.16b, v2.4b[0]
+ UDOT v31.4s, v7.16b, v3.4b[0]
+
+ UDOT v16.4s, v4.16b, v0.4b[1]
+ UDOT v17.4s, v4.16b, v1.4b[1]
+ LDP q6, q7, [x5], 32
+ UDOT v18.4s, v4.16b, v2.4b[1]
+ UDOT v19.4s, v4.16b, v3.4b[1]
+ UDOT v20.4s, v5.16b, v0.4b[1]
+ UDOT v21.4s, v5.16b, v1.4b[1]
+ UDOT v22.4s, v5.16b, v2.4b[1]
+ UDOT v23.4s, v5.16b, v3.4b[1]
+ UDOT v24.4s, v6.16b, v0.4b[1]
+ UDOT v25.4s, v6.16b, v1.4b[1]
+ LDP q4, q5, [x5], 32
+ UDOT v26.4s, v6.16b, v2.4b[1]
+ UDOT v27.4s, v6.16b, v3.4b[1]
+ UDOT v28.4s, v7.16b, v0.4b[1]
+ UDOT v29.4s, v7.16b, v1.4b[1]
+ UDOT v30.4s, v7.16b, v2.4b[1]
+ UDOT v31.4s, v7.16b, v3.4b[1]
+
+ UDOT v16.4s, v4.16b, v0.4b[2]
+ UDOT v17.4s, v4.16b, v1.4b[2]
+ LDP q6, q7, [x5], 32
+ UDOT v18.4s, v4.16b, v2.4b[2]
+ UDOT v19.4s, v4.16b, v3.4b[2]
+ UDOT v20.4s, v5.16b, v0.4b[2]
+ UDOT v21.4s, v5.16b, v1.4b[2]
+ UDOT v22.4s, v5.16b, v2.4b[2]
+ UDOT v23.4s, v5.16b, v3.4b[2]
+ UDOT v24.4s, v6.16b, v0.4b[2]
+ UDOT v25.4s, v6.16b, v1.4b[2]
+ LDP q4, q5, [x5], 32
+ UDOT v26.4s, v6.16b, v2.4b[2]
+ UDOT v27.4s, v6.16b, v3.4b[2]
+ UDOT v28.4s, v7.16b, v0.4b[2]
+ UDOT v29.4s, v7.16b, v1.4b[2]
+ UDOT v30.4s, v7.16b, v2.4b[2]
+ UDOT v31.4s, v7.16b, v3.4b[2]
+
+ UDOT v16.4s, v4.16b, v0.4b[3]
+ UDOT v17.4s, v4.16b, v1.4b[3]
+ LDP q6, q7, [x5], 32
+ UDOT v18.4s, v4.16b, v2.4b[3]
+ UDOT v19.4s, v4.16b, v3.4b[3]
+ UDOT v20.4s, v5.16b, v0.4b[3]
+ UDOT v21.4s, v5.16b, v1.4b[3]
+ UDOT v22.4s, v5.16b, v2.4b[3]
+ UDOT v23.4s, v5.16b, v3.4b[3]
+ UDOT v24.4s, v6.16b, v0.4b[3]
+ UDOT v25.4s, v6.16b, v1.4b[3]
+ UDOT v26.4s, v6.16b, v2.4b[3]
+ UDOT v27.4s, v6.16b, v3.4b[3]
+ SUBS x0, x0, 16
+ UDOT v28.4s, v7.16b, v0.4b[3]
+ UDOT v29.4s, v7.16b, v1.4b[3]
+ UDOT v30.4s, v7.16b, v2.4b[3]
+ UDOT v31.4s, v7.16b, v3.4b[3]
+ B.HS 2b
+
+ # Is there a remainder? (4 to 12 bytes of A)
+ TST x0, 15
+ B.NE 4f
+
+3:
+ # ks loop
+ SUBS x9, x9, 32 // ks -= MR * sizeof(void*)
+ B.HI 1b
+
+ ADDP v0.4s, v12.4s, v12.4s
+ ADDP v1.4s, v13.4s, v13.4s
+ ADDP v2.4s, v14.4s, v14.4s
+ ADDP v3.4s, v15.4s, v15.4s
+ ADDP v12.4s, v0.4s, v0.4s
+ ADDP v13.4s, v1.4s, v1.4s
+ ADDP v14.4s, v2.4s, v2.4s
+ ADDP v15.4s, v3.4s, v3.4s
+
+ # Subtract zero point from accumulators
+ SUB v16.4s, v16.4s, v12.4s
+ SUB v17.4s, v17.4s, v13.4s
+ SUB v18.4s, v18.4s, v14.4s
+ SUB v19.4s, v19.4s, v15.4s
+ SUB v20.4s, v20.4s, v12.4s
+ SUB v21.4s, v21.4s, v13.4s
+ SUB v22.4s, v22.4s, v14.4s
+ SUB v23.4s, v23.4s, v15.4s
+ SUB v24.4s, v24.4s, v12.4s
+ SUB v25.4s, v25.4s, v13.4s
+ SUB v26.4s, v26.4s, v14.4s
+ SUB v27.4s, v27.4s, v15.4s
+ SUB v28.4s, v28.4s, v12.4s
+ SUB v29.4s, v29.4s, v13.4s
+ SUB v30.4s, v30.4s, v14.4s
+ SUB v31.4s, v31.4s, v15.4s
+
+ SCVTF v16.4s, v16.4s
+ SCVTF v17.4s, v17.4s
+ # Apply params - scale, bias and clamp
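+ # Per element this computes (a sketch of the fp32 requantization):
+ #   y = clamp(round_even(float(acc) * scale) + output_zero_point,
+ #             output_min, output_max)
+ # SCVTF+FMUL+FCVTNS do the scaled round-to-nearest-even, SQADD adds the
+ # zero point after narrowing to 16 bits, SQXTUN/UMAX/UMIN narrow and clamp.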
+ LD1R {v4.4s}, [x11], 4
+ SCVTF v18.4s, v18.4s
+ SCVTF v19.4s, v19.4s
+ SCVTF v20.4s, v20.4s
+ SCVTF v21.4s, v21.4s
+ SCVTF v22.4s, v22.4s
+ SCVTF v23.4s, v23.4s
+ SCVTF v24.4s, v24.4s
+ SCVTF v25.4s, v25.4s
+ SCVTF v26.4s, v26.4s
+ SCVTF v27.4s, v27.4s
+ SCVTF v28.4s, v28.4s
+ SCVTF v29.4s, v29.4s
+ SCVTF v30.4s, v30.4s
+ SCVTF v31.4s, v31.4s
+
+ FMUL v16.4s, v16.4s, v4.4s
+ FMUL v17.4s, v17.4s, v4.4s
+ FMUL v18.4s, v18.4s, v4.4s
+ FMUL v19.4s, v19.4s, v4.4s
+ FMUL v20.4s, v20.4s, v4.4s
+ FMUL v21.4s, v21.4s, v4.4s
+ FMUL v22.4s, v22.4s, v4.4s
+ FMUL v23.4s, v23.4s, v4.4s
+ FMUL v24.4s, v24.4s, v4.4s
+ FMUL v25.4s, v25.4s, v4.4s
+ FMUL v26.4s, v26.4s, v4.4s
+ FMUL v27.4s, v27.4s, v4.4s
+ FMUL v28.4s, v28.4s, v4.4s
+ FMUL v29.4s, v29.4s, v4.4s
+ FMUL v30.4s, v30.4s, v4.4s
+ FMUL v31.4s, v31.4s, v4.4s
+
+ FCVTNS v16.4s, v16.4s
+ FCVTNS v17.4s, v17.4s
+ FCVTNS v18.4s, v18.4s
+ FCVTNS v19.4s, v19.4s
+ FCVTNS v20.4s, v20.4s
+ FCVTNS v21.4s, v21.4s
+ FCVTNS v22.4s, v22.4s
+ FCVTNS v23.4s, v23.4s
+ FCVTNS v24.4s, v24.4s
+ FCVTNS v25.4s, v25.4s
+ FCVTNS v26.4s, v26.4s
+ FCVTNS v27.4s, v27.4s
+ FCVTNS v28.4s, v28.4s
+ FCVTNS v29.4s, v29.4s
+ FCVTNS v30.4s, v30.4s
+ FCVTNS v31.4s, v31.4s
+
+ SQXTN v16.4h, v16.4s
+ SQXTN v17.4h, v17.4s
+ SQXTN v18.4h, v18.4s
+ SQXTN v19.4h, v19.4s
+ SQXTN v24.4h, v24.4s
+ SQXTN v25.4h, v25.4s
+ SQXTN v26.4h, v26.4s
+ SQXTN v27.4h, v27.4s
+ LD1R {v6.8h}, [x11], 2 // load output_zero_point
+
+ SQXTN2 v16.8h, v20.4s
+ SQXTN2 v17.8h, v21.4s
+ SQXTN2 v18.8h, v22.4s
+ SQXTN2 v19.8h, v23.4s
+ SQXTN2 v24.8h, v28.4s
+ SQXTN2 v25.8h, v29.4s
+ SQXTN2 v26.8h, v30.4s
+ SQXTN2 v27.8h, v31.4s
+
+ SQADD v16.8h, v16.8h, v6.8h
+ SQADD v17.8h, v17.8h, v6.8h
+ SQADD v18.8h, v18.8h, v6.8h
+ SQADD v19.8h, v19.8h, v6.8h
+ SQADD v24.8h, v24.8h, v6.8h
+ SQADD v25.8h, v25.8h, v6.8h
+ SQADD v26.8h, v26.8h, v6.8h
+ SQADD v27.8h, v27.8h, v6.8h
+ LD1R {v4.16b}, [x11], 1 // clamp min value
+
+ SQXTUN v0.8b, v16.8h
+ SQXTUN v1.8b, v17.8h
+ SQXTUN v2.8b, v18.8h
+ SQXTUN v3.8b, v19.8h
+ LD1R {v5.16b}, [x11] // clamp max value
+ SQXTUN2 v0.16b, v24.8h
+ SQXTUN2 v1.16b, v25.8h
+ SQXTUN2 v2.16b, v26.8h
+ SQXTUN2 v3.16b, v27.8h
+ LDR x0, [sp, 48] // Load cn_stride
+
+ UMAX v0.16b, v0.16b, v4.16b
+ UMAX v1.16b, v1.16b, v4.16b
+ SUB x11, x11, 7 // rewind params pointer
+ UMAX v2.16b, v2.16b, v4.16b
+ UMAX v3.16b, v3.16b, v4.16b
+ SUBS x1, x1, 16
+ UMIN v0.16b, v0.16b, v5.16b
+ UMIN v1.16b, v1.16b, v5.16b
+ UMIN v2.16b, v2.16b, v5.16b
+ UMIN v3.16b, v3.16b, v5.16b
+ B.LO 6f
+
+ # Store full 4 x 16
+ ST1 {v3.16b}, [x7], x0
+ ST1 {v2.16b}, [x17], x0
+ ST1 {v1.16b}, [x16], x0
+ ST1 {v0.16b}, [x6], x0
+
+ SUB x4, x4, x3 // a -= ks
+
+ # nc loop
+ B.HI 0b
+
+ # Restore d8,d12-d15 from stack
+ LDP d14, d15, [sp, 32]
+ LDP d12, d13, [sp, 16]
+ LDR d8, [sp], 48
+ RET
+
+ # Remainder - 8 bytes of A
+ .p2align 3
+4:
+ # Is there a remainder? (8 bytes of A)
+ TBZ x0, 3, 5f
+
+ LDR d0, [x13], 8
+ LDR q4, [x5], 16
+ LDR d1, [x14], 8
+ LDR d2, [x15], 8
+ LDR d3, [x10], 8
+ LDR q5, [x5], 16
+
+ UDOT v12.4s, v8.16b, v0.16b // update zero point
+ UDOT v13.4s, v8.16b, v1.16b
+ UDOT v14.4s, v8.16b, v2.16b
+ UDOT v15.4s, v8.16b, v3.16b
+
+ UDOT v16.4s, v4.16b, v0.4b[0]
+ UDOT v17.4s, v4.16b, v1.4b[0]
+ LDP q6, q7, [x5], 32
+ UDOT v18.4s, v4.16b, v2.4b[0]
+ UDOT v19.4s, v4.16b, v3.4b[0]
+ UDOT v20.4s, v5.16b, v0.4b[0]
+ UDOT v21.4s, v5.16b, v1.4b[0]
+ UDOT v22.4s, v5.16b, v2.4b[0]
+ UDOT v23.4s, v5.16b, v3.4b[0]
+ UDOT v24.4s, v6.16b, v0.4b[0]
+ UDOT v25.4s, v6.16b, v1.4b[0]
+ LDP q4, q5, [x5], 32
+ UDOT v26.4s, v6.16b, v2.4b[0]
+ UDOT v27.4s, v6.16b, v3.4b[0]
+ UDOT v28.4s, v7.16b, v0.4b[0]
+ UDOT v29.4s, v7.16b, v1.4b[0]
+ UDOT v30.4s, v7.16b, v2.4b[0]
+ UDOT v31.4s, v7.16b, v3.4b[0]
+ UDOT v16.4s, v4.16b, v0.4b[1]
+ UDOT v17.4s, v4.16b, v1.4b[1]
+ LDP q6, q7, [x5], 32
+ UDOT v18.4s, v4.16b, v2.4b[1]
+ UDOT v19.4s, v4.16b, v3.4b[1]
+ UDOT v20.4s, v5.16b, v0.4b[1]
+ UDOT v21.4s, v5.16b, v1.4b[1]
+ UDOT v22.4s, v5.16b, v2.4b[1]
+ UDOT v23.4s, v5.16b, v3.4b[1]
+ UDOT v24.4s, v6.16b, v0.4b[1]
+ UDOT v25.4s, v6.16b, v1.4b[1]
+ UDOT v26.4s, v6.16b, v2.4b[1]
+ UDOT v27.4s, v6.16b, v3.4b[1]
+ UDOT v28.4s, v7.16b, v0.4b[1]
+ UDOT v29.4s, v7.16b, v1.4b[1]
+ UDOT v30.4s, v7.16b, v2.4b[1]
+ UDOT v31.4s, v7.16b, v3.4b[1]
+ # Is there a remainder? (4 bytes of A)
+ TBZ x0, 2, 3b
+
+ # Remainder - 4 bytes of A
+5:
+ LDR s0, [x13], 4
+ LDR q4, [x5], 16
+ LDR s1, [x14], 4
+ LDR s2, [x15], 4
+ LDR s3, [x10], 4
+ LDR q5, [x5], 16
+
+ UDOT v12.4s, v8.16b, v0.16b // update zero point
+ UDOT v13.4s, v8.16b, v1.16b
+ UDOT v14.4s, v8.16b, v2.16b
+ UDOT v15.4s, v8.16b, v3.16b
+
+ UDOT v16.4s, v4.16b, v0.4b[0]
+ UDOT v17.4s, v4.16b, v1.4b[0]
+ UDOT v18.4s, v4.16b, v2.4b[0]
+ UDOT v19.4s, v4.16b, v3.4b[0]
+ LDP q6, q7, [x5], 32
+ UDOT v20.4s, v5.16b, v0.4b[0]
+ UDOT v21.4s, v5.16b, v1.4b[0]
+ UDOT v22.4s, v5.16b, v2.4b[0]
+ UDOT v23.4s, v5.16b, v3.4b[0]
+ UDOT v24.4s, v6.16b, v0.4b[0]
+ UDOT v25.4s, v6.16b, v1.4b[0]
+ UDOT v26.4s, v6.16b, v2.4b[0]
+ UDOT v27.4s, v6.16b, v3.4b[0]
+ UDOT v28.4s, v7.16b, v0.4b[0]
+ UDOT v29.4s, v7.16b, v1.4b[0]
+ UDOT v30.4s, v7.16b, v2.4b[0]
+ UDOT v31.4s, v7.16b, v3.4b[0]
+ B 3b
+
+ # Store odd width
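+ # nc (x1) is now the remaining column count (< 16). Each TBZ below tests
+ # one bit of nc to store 8, 4, 2 or 1 byte(s) per row; the DUPs shift the
+ # next unstored lanes down so the following, narrower store picks them up.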
+ .p2align 3
+6:
+ TBZ x1, 3, 7f
+ STR d3, [x7], 8
+ STR d2, [x17], 8
+ DUP d3, v3.d[1]
+ DUP d2, v2.d[1]
+ STR d1, [x16], 8
+ STR d0, [x6], 8
+ DUP d1, v1.d[1]
+ DUP d0, v0.d[1]
+7:
+ TBZ x1, 2, 8f
+ STR s3, [x7], 4
+ STR s2, [x17], 4
+ DUP s3, v3.s[1]
+ DUP s2, v2.s[1]
+ STR s1, [x16], 4
+ STR s0, [x6], 4
+ DUP s1, v1.s[1]
+ DUP s0, v0.s[1]
+8:
+ TBZ x1, 1, 9f
+ STR h3, [x7], 2
+ STR h2, [x17], 2
+ DUP h3, v3.h[1]
+ DUP h2, v2.h[1]
+ STR h1, [x16], 2
+ STR h0, [x6], 2
+ DUP h1, v1.h[1]
+ DUP h0, v0.h[1]
+9:
+ TBZ x1, 0, 10f
+ STR b3, [x7]
+ STR b2, [x17]
+ STR b1, [x16]
+ STR b0, [x6]
+10:
+ # Restore d8,d12-d15 from stack
+ LDP d14, d15, [sp, 32]
+ LDP d12, d13, [sp, 16]
+ LDR d8, [sp], 48
+ RET
+
+END_FUNCTION xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
+
+#ifdef __ELF__
+.section ".note.GNU-stack","",%progbits
+#endif
diff --git a/src/qu8-igemm/gen/4x16c4-minmax-fp32-neondot.c b/src/qu8-igemm/gen/4x16c4-minmax-fp32-neondot.c
new file mode 100644
index 0000000..1d095a6
--- /dev/null
+++ b/src/qu8-igemm/gen/4x16c4-minmax-fp32-neondot.c
@@ -0,0 +1,376 @@
+// Auto-generated file. Do not edit!
+// Template: src/qu8-igemm/c4-neondot.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/intrinsics-polyfill.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const uint8_t** restrict a,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const uint8_t* zero,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 4 * sizeof(uint8_t));
+ uint8_t* c0 = c;
+ uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ const uint8x8_t va_zero_point = vld1_dup_u8(&params->fp32_neonv8.kernel_zero_point[0]);
+
+ do {
+ // Initialize accumulators with bias. 16 bias values are loaded from the
+ // weight matrix, at the start of the group of 16 columns.
+ uint32x4_t vpacc0x0123 = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0x4567 = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0x89AB = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc0xCDEF = vld1q_u32(w); w = (const void*) ((const uint32_t*) w + 4);
+ uint32x4_t vpacc1x0123 = vpacc0x0123;
+ uint32x4_t vpacc1x4567 = vpacc0x4567;
+ uint32x4_t vpacc1x89AB = vpacc0x89AB;
+ uint32x4_t vpacc1xCDEF = vpacc0xCDEF;
+ uint32x4_t vpacc2x0123 = vpacc0x0123;
+ uint32x4_t vpacc2x4567 = vpacc0x4567;
+ uint32x4_t vpacc2x89AB = vpacc0x89AB;
+ uint32x4_t vpacc2xCDEF = vpacc0xCDEF;
+ uint32x4_t vpacc3x0123 = vpacc0x0123;
+ uint32x4_t vpacc3x4567 = vpacc0x4567;
+ uint32x4_t vpacc3x89AB = vpacc0x89AB;
+ uint32x4_t vpacc3xCDEF = vpacc0xCDEF;
+ uint32x2_t vnacc0 = vmov_n_u32(0);
+ uint32x2_t vnacc1 = vmov_n_u32(0);
+ uint32x2_t vnacc2 = vmov_n_u32(0);
+ uint32x2_t vnacc3 = vmov_n_u32(0);
+
+ size_t p = ks;
+ do {
+ const uint8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const uint8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ const uint8_t* restrict a2 = a[2];
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
+ }
+ const uint8_t* restrict a3 = a[3];
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const uint8_t*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ // Inner accumulation loop along the 16 columns.
+ size_t k = kc;
+ // Loop unrolled 2x to load 8 bytes of A at a time.
+ while (k >= 8 * sizeof(uint8_t)) {
+ // Load a 4x8 block of activations.
+ const uint8x8_t va0x01234567 = vld1_u8(a0); a0 += 8;
+ const uint8x8_t va1x01234567 = vld1_u8(a1); a1 += 8;
+ const uint8x8_t va2x01234567 = vld1_u8(a2); a2 += 8;
+ const uint8x8_t va3x01234567 = vld1_u8(a3); a3 += 8;
+
+ // Load an 8x16 block of weights.
+ const uint8x16_t vb0123x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb4567xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+
+ // Multiply-accumulate: 4x8 * 8x16 --> 4x16.
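+ // Each vdotq_lane_u32(acc, b, a, l) is equivalent to this scalar sketch
+ // (lane l selects one 4-byte group of a):
+ //   for (size_t j = 0; j < 4; j++)    // 4 output columns
+ //     for (size_t i = 0; i < 4; i++)  // 4 bytes of the K group
+ //       acc[j] += (uint32_t) b[4*j + i] * (uint32_t) a[4*l + i];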
+ vnacc0 = vdot_u32(vnacc0, va_zero_point, va0x01234567);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb0123x0123, va0x01234567, 0);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb0123x4567, va0x01234567, 0);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb0123x89AB, va0x01234567, 0);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb4567x0123, va0x01234567, 1);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb4567x4567, va0x01234567, 1);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb4567x89AB, va0x01234567, 1);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb4567xCDEF, va0x01234567, 1);
+ vnacc1 = vdot_u32(vnacc1, va_zero_point, va1x01234567);
+ vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb0123x0123, va1x01234567, 0);
+ vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb0123x4567, va1x01234567, 0);
+ vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb0123x89AB, va1x01234567, 0);
+ vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
+ vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb4567x0123, va1x01234567, 1);
+ vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb4567x4567, va1x01234567, 1);
+ vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb4567x89AB, va1x01234567, 1);
+ vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb4567xCDEF, va1x01234567, 1);
+ vnacc2 = vdot_u32(vnacc2, va_zero_point, va2x01234567);
+ vpacc2x0123 = vdotq_lane_u32(vpacc2x0123, vb0123x0123, va2x01234567, 0);
+ vpacc2x4567 = vdotq_lane_u32(vpacc2x4567, vb0123x4567, va2x01234567, 0);
+ vpacc2x89AB = vdotq_lane_u32(vpacc2x89AB, vb0123x89AB, va2x01234567, 0);
+ vpacc2xCDEF = vdotq_lane_u32(vpacc2xCDEF, vb0123xCDEF, va2x01234567, 0);
+ vpacc2x0123 = vdotq_lane_u32(vpacc2x0123, vb4567x0123, va2x01234567, 1);
+ vpacc2x4567 = vdotq_lane_u32(vpacc2x4567, vb4567x4567, va2x01234567, 1);
+ vpacc2x89AB = vdotq_lane_u32(vpacc2x89AB, vb4567x89AB, va2x01234567, 1);
+ vpacc2xCDEF = vdotq_lane_u32(vpacc2xCDEF, vb4567xCDEF, va2x01234567, 1);
+ vnacc3 = vdot_u32(vnacc3, va_zero_point, va3x01234567);
+ vpacc3x0123 = vdotq_lane_u32(vpacc3x0123, vb0123x0123, va3x01234567, 0);
+ vpacc3x4567 = vdotq_lane_u32(vpacc3x4567, vb0123x4567, va3x01234567, 0);
+ vpacc3x89AB = vdotq_lane_u32(vpacc3x89AB, vb0123x89AB, va3x01234567, 0);
+ vpacc3xCDEF = vdotq_lane_u32(vpacc3xCDEF, vb0123xCDEF, va3x01234567, 0);
+ vpacc3x0123 = vdotq_lane_u32(vpacc3x0123, vb4567x0123, va3x01234567, 1);
+ vpacc3x4567 = vdotq_lane_u32(vpacc3x4567, vb4567x4567, va3x01234567, 1);
+ vpacc3x89AB = vdotq_lane_u32(vpacc3x89AB, vb4567x89AB, va3x01234567, 1);
+ vpacc3xCDEF = vdotq_lane_u32(vpacc3xCDEF, vb4567xCDEF, va3x01234567, 1);
+
+ k -= 8 * sizeof(uint8_t);
+ }
+ // Handle the final 4 bytes of `k`, if any (kc is rounded up to a multiple of 4).
+ if XNN_UNLIKELY(k != 0) {
+ // Load a 4x4 block of activations.
+ const uint8x8_t va0x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a0, vmov_n_u32(0), 0)); a0 += 4;
+ const uint8x8_t va1x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a1, vmov_n_u32(0), 0)); a1 += 4;
+ const uint8x8_t va2x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a2, vmov_n_u32(0), 0)); a2 += 4;
+ const uint8x8_t va3x01234567 = vreinterpret_u8_u32(vld1_lane_u32((const void*) a3, vmov_n_u32(0), 0)); a3 += 4;
+
+ // Load a 4x16 block of weights.
+ const uint8x16_t vb0123x0123 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x4567 = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123x89AB = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+ const uint8x16_t vb0123xCDEF = vld1q_u8(w); w = (const void*) ((const uint8_t*) w + 16);
+
+ // Multiply-accumulate: 4x4 * 4x16 --> 4x16.
+ vnacc0 = vdot_u32(vnacc0, va_zero_point, va0x01234567);
+ vpacc0x0123 = vdotq_lane_u32(vpacc0x0123, vb0123x0123, va0x01234567, 0);
+ vpacc0x4567 = vdotq_lane_u32(vpacc0x4567, vb0123x4567, va0x01234567, 0);
+ vpacc0x89AB = vdotq_lane_u32(vpacc0x89AB, vb0123x89AB, va0x01234567, 0);
+ vpacc0xCDEF = vdotq_lane_u32(vpacc0xCDEF, vb0123xCDEF, va0x01234567, 0);
+ vnacc1 = vdot_u32(vnacc1, va_zero_point, va1x01234567);
+ vpacc1x0123 = vdotq_lane_u32(vpacc1x0123, vb0123x0123, va1x01234567, 0);
+ vpacc1x4567 = vdotq_lane_u32(vpacc1x4567, vb0123x4567, va1x01234567, 0);
+ vpacc1x89AB = vdotq_lane_u32(vpacc1x89AB, vb0123x89AB, va1x01234567, 0);
+ vpacc1xCDEF = vdotq_lane_u32(vpacc1xCDEF, vb0123xCDEF, va1x01234567, 0);
+ vnacc2 = vdot_u32(vnacc2, va_zero_point, va2x01234567);
+ vpacc2x0123 = vdotq_lane_u32(vpacc2x0123, vb0123x0123, va2x01234567, 0);
+ vpacc2x4567 = vdotq_lane_u32(vpacc2x4567, vb0123x4567, va2x01234567, 0);
+ vpacc2x89AB = vdotq_lane_u32(vpacc2x89AB, vb0123x89AB, va2x01234567, 0);
+ vpacc2xCDEF = vdotq_lane_u32(vpacc2xCDEF, vb0123xCDEF, va2x01234567, 0);
+ vnacc3 = vdot_u32(vnacc3, va_zero_point, va3x01234567);
+ vpacc3x0123 = vdotq_lane_u32(vpacc3x0123, vb0123x0123, va3x01234567, 0);
+ vpacc3x4567 = vdotq_lane_u32(vpacc3x4567, vb0123x4567, va3x01234567, 0);
+ vpacc3x89AB = vdotq_lane_u32(vpacc3x89AB, vb0123x89AB, va3x01234567, 0);
+ vpacc3xCDEF = vdotq_lane_u32(vpacc3xCDEF, vb0123xCDEF, va3x01234567, 0);
+ }
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ // Subtract zero point from accumulators.
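+ // vpadd folds the two lanes of each vnaccN together, so every lane of
+ // vnaccNx0123 holds kernel_zero_point * sum(aN); subtracting it per output
+ // column turns sum(aN * b) into sum(aN * (b - kernel_zero_point)).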
+ vnacc0 = vpadd_u32(vnacc0, vnacc0);
+ const uint32x4_t vnacc0x0123 = vcombine_u32(vnacc0, vnacc0);
+ int32x4_t vacc0x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc0x0123, vnacc0x0123));
+ int32x4_t vacc0x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc0x4567, vnacc0x0123));
+ int32x4_t vacc0x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc0x89AB, vnacc0x0123));
+ int32x4_t vacc0xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc0xCDEF, vnacc0x0123));
+ vnacc1 = vpadd_u32(vnacc1, vnacc1);
+ const uint32x4_t vnacc1x0123 = vcombine_u32(vnacc1, vnacc1);
+ int32x4_t vacc1x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc1x0123, vnacc1x0123));
+ int32x4_t vacc1x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc1x4567, vnacc1x0123));
+ int32x4_t vacc1x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc1x89AB, vnacc1x0123));
+ int32x4_t vacc1xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc1xCDEF, vnacc1x0123));
+ vnacc2 = vpadd_u32(vnacc2, vnacc2);
+ const uint32x4_t vnacc2x0123 = vcombine_u32(vnacc2, vnacc2);
+ int32x4_t vacc2x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc2x0123, vnacc2x0123));
+ int32x4_t vacc2x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc2x4567, vnacc2x0123));
+ int32x4_t vacc2x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc2x89AB, vnacc2x0123));
+ int32x4_t vacc2xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc2xCDEF, vnacc2x0123));
+ vnacc3 = vpadd_u32(vnacc3, vnacc3);
+ const uint32x4_t vnacc3x0123 = vcombine_u32(vnacc3, vnacc3);
+ int32x4_t vacc3x0123 = vreinterpretq_s32_u32(vsubq_u32(vpacc3x0123, vnacc3x0123));
+ int32x4_t vacc3x4567 = vreinterpretq_s32_u32(vsubq_u32(vpacc3x4567, vnacc3x0123));
+ int32x4_t vacc3x89AB = vreinterpretq_s32_u32(vsubq_u32(vpacc3x89AB, vnacc3x0123));
+ int32x4_t vacc3xCDEF = vreinterpretq_s32_u32(vsubq_u32(vpacc3xCDEF, vnacc3x0123));
+
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+ float32x4_t vfpacc0x89AB = vcvtq_f32_s32(vacc0x89AB);
+ float32x4_t vfpacc0xCDEF = vcvtq_f32_s32(vacc0xCDEF);
+ float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123);
+ float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567);
+ float32x4_t vfpacc1x89AB = vcvtq_f32_s32(vacc1x89AB);
+ float32x4_t vfpacc1xCDEF = vcvtq_f32_s32(vacc1xCDEF);
+ float32x4_t vfpacc2x0123 = vcvtq_f32_s32(vacc2x0123);
+ float32x4_t vfpacc2x4567 = vcvtq_f32_s32(vacc2x4567);
+ float32x4_t vfpacc2x89AB = vcvtq_f32_s32(vacc2x89AB);
+ float32x4_t vfpacc2xCDEF = vcvtq_f32_s32(vacc2xCDEF);
+ float32x4_t vfpacc3x0123 = vcvtq_f32_s32(vacc3x0123);
+ float32x4_t vfpacc3x4567 = vcvtq_f32_s32(vacc3x4567);
+ float32x4_t vfpacc3x89AB = vcvtq_f32_s32(vacc3x89AB);
+ float32x4_t vfpacc3xCDEF = vcvtq_f32_s32(vacc3xCDEF);
+
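+ // fp32 requantization, per element (a sketch of what the intrinsics below
+ // encode): out = clamp(round_even(acc * scale) + output_zero_point,
+ // output_min, output_max). vcvtnq_s32_f32 rounds to nearest-even; the zero
+ // point is added after the saturating narrow to 16 bits.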
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neonv8.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+ vfpacc0x89AB = vmulq_f32(vfpacc0x89AB, vscale);
+ vfpacc0xCDEF = vmulq_f32(vfpacc0xCDEF, vscale);
+ vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale);
+ vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale);
+ vfpacc1x89AB = vmulq_f32(vfpacc1x89AB, vscale);
+ vfpacc1xCDEF = vmulq_f32(vfpacc1xCDEF, vscale);
+ vfpacc2x0123 = vmulq_f32(vfpacc2x0123, vscale);
+ vfpacc2x4567 = vmulq_f32(vfpacc2x4567, vscale);
+ vfpacc2x89AB = vmulq_f32(vfpacc2x89AB, vscale);
+ vfpacc2xCDEF = vmulq_f32(vfpacc2xCDEF, vscale);
+ vfpacc3x0123 = vmulq_f32(vfpacc3x0123, vscale);
+ vfpacc3x4567 = vmulq_f32(vfpacc3x4567, vscale);
+ vfpacc3x89AB = vmulq_f32(vfpacc3x89AB, vscale);
+ vfpacc3xCDEF = vmulq_f32(vfpacc3xCDEF, vscale);
+
+ vacc0x0123 = vcvtnq_s32_f32(vfpacc0x0123);
+ vacc0x4567 = vcvtnq_s32_f32(vfpacc0x4567);
+ vacc0x89AB = vcvtnq_s32_f32(vfpacc0x89AB);
+ vacc0xCDEF = vcvtnq_s32_f32(vfpacc0xCDEF);
+ vacc1x0123 = vcvtnq_s32_f32(vfpacc1x0123);
+ vacc1x4567 = vcvtnq_s32_f32(vfpacc1x4567);
+ vacc1x89AB = vcvtnq_s32_f32(vfpacc1x89AB);
+ vacc1xCDEF = vcvtnq_s32_f32(vfpacc1xCDEF);
+ vacc2x0123 = vcvtnq_s32_f32(vfpacc2x0123);
+ vacc2x4567 = vcvtnq_s32_f32(vfpacc2x4567);
+ vacc2x89AB = vcvtnq_s32_f32(vfpacc2x89AB);
+ vacc2xCDEF = vcvtnq_s32_f32(vfpacc2xCDEF);
+ vacc3x0123 = vcvtnq_s32_f32(vfpacc3x0123);
+ vacc3x4567 = vcvtnq_s32_f32(vfpacc3x4567);
+ vacc3x89AB = vcvtnq_s32_f32(vfpacc3x89AB);
+ vacc3xCDEF = vcvtnq_s32_f32(vfpacc3xCDEF);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->fp32_neonv8.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+ const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+ const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+ const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
+ uint8x16_t vout0x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc0x89ABCDEF);
+ uint8x16_t vout1x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc1x01234567), vacc1x89ABCDEF);
+ uint8x16_t vout2x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc2x89ABCDEF);
+ uint8x16_t vout3x0123456789ABCDEF = vqmovun_high_s16(vqmovun_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+ const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+ const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+ const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+ uint8x16_t vout0x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc0x89ABCDEF));
+ uint8x16_t vout1x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc1x01234567), vqmovun_s16(vacc1x89ABCDEF));
+ uint8x16_t vout2x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc2x89ABCDEF));
+ uint8x16_t vout3x0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc3x01234567), vqmovun_s16(vacc3x89ABCDEF));
+#endif
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neonv8.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neonv8.output_max);
+
+ vout0x0123456789ABCDEF = vmaxq_u8(vout0x0123456789ABCDEF, voutput_min);
+ vout1x0123456789ABCDEF = vmaxq_u8(vout1x0123456789ABCDEF, voutput_min);
+ vout2x0123456789ABCDEF = vmaxq_u8(vout2x0123456789ABCDEF, voutput_min);
+ vout3x0123456789ABCDEF = vmaxq_u8(vout3x0123456789ABCDEF, voutput_min);
+
+ vout0x0123456789ABCDEF = vminq_u8(vout0x0123456789ABCDEF, voutput_max);
+ vout1x0123456789ABCDEF = vminq_u8(vout1x0123456789ABCDEF, voutput_max);
+ vout2x0123456789ABCDEF = vminq_u8(vout2x0123456789ABCDEF, voutput_max);
+ vout3x0123456789ABCDEF = vminq_u8(vout3x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_u8(c3 + 0, vout3x0123456789ABCDEF);
+ vst1q_u8(c2 + 0, vout2x0123456789ABCDEF);
+ vst1q_u8(c1 + 0, vout1x0123456789ABCDEF);
+ vst1q_u8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
+ c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
+ c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const uint8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 16;
+ } else {
+ uint8x16_t vout2x01234567_3x01234567 = vcombine_u8(vget_low_u8(vout2x0123456789ABCDEF), vget_low_u8(vout3x0123456789ABCDEF));
+ uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vget_low_u8(vout0x0123456789ABCDEF), vget_low_u8(vout1x0123456789ABCDEF));
+ if (nc & 8) {
+ vst1_u8(c3, vget_high_u8(vout2x01234567_3x01234567)); c3 += 8;
+ vst1_u8(c2, vget_low_u8(vout2x01234567_3x01234567)); c2 += 8;
+ vst1_u8(c1, vget_high_u8(vout0x01234567_1x01234567)); c1 += 8;
+ vst1_u8(c0, vget_low_u8(vout0x01234567_1x01234567)); c0 += 8;
+ vout2x01234567_3x01234567 = vcombine_u8(vget_high_u8(vout2x0123456789ABCDEF), vget_high_u8(vout3x0123456789ABCDEF));
+ vout0x01234567_1x01234567 = vcombine_u8(vget_high_u8(vout0x0123456789ABCDEF), vget_high_u8(vout1x0123456789ABCDEF));
+ }
+ if (nc & 4) {
+ vst1q_lane_u32((void*) c3, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 2); c3 += 4;
+ vst1q_lane_u32((void*) c2, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 0); c2 += 4;
+ vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+ vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1q_lane_u16((void*) c3, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 4); c3 += 2;
+ vst1q_lane_u16((void*) c2, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 0); c2 += 2;
+ vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+ vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8);
+ vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0);
+ vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qu8-igemm/gen/4x8-minmax-fp32-neon-mlal-lane.c b/src/qu8-igemm/gen/4x8-minmax-fp32-neon-mlal-lane.c
new file mode 100644
index 0000000..5599828
--- /dev/null
+++ b/src/qu8-igemm/gen/4x8-minmax-fp32-neon-mlal-lane.c
@@ -0,0 +1,410 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/neon-mlal-lane.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/igemm.h>
+
+
+void xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const uint8_t** restrict a,
+ const void* restrict w,
+ uint8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const uint8_t* zero,
+ const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(uint8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ uint8_t* c0 = c;
+ uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ uint8_t* c2 = (uint8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ uint8_t* c3 = (uint8_t*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ const uint8x8_t vb_zero_point = vld1_dup_u8(&params->fp32_neon.kernel_zero_point[0]);
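+ // Unlike the neondot kernels above, this variant forms
+ // (w - kernel_zero_point) up front via vsubl_u8 in the inner loop, so no
+ // separate zero-point correction accumulator is needed.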
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((const int32_t*) w + 4);
+ int32x4_t vacc1x0123 = vacc0x0123;
+ int32x4_t vacc1x4567 = vacc0x4567;
+ int32x4_t vacc2x0123 = vacc0x0123;
+ int32x4_t vacc2x4567 = vacc0x4567;
+ int32x4_t vacc3x0123 = vacc0x0123;
+ int32x4_t vacc3x4567 = vacc0x4567;
+
+ size_t p = ks;
+ do {
+ const uint8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const uint8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ const uint8_t* restrict a2 = a[2];
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const uint8_t*) ((uintptr_t) a2 + a_offset);
+ }
+ const uint8_t* restrict a3 = a[3];
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const uint8_t*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+ while (k >= 8 * sizeof(uint8_t)) {
+ const uint8x8_t va0 = vld1_u8(a0); a0 += 8;
+ const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
+ const uint8x8_t va1 = vld1_u8(a1); a1 += 8;
+ const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
+ const uint8x8_t va2 = vld1_u8(a2); a2 += 8;
+ const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));
+ const uint8x8_t va3 = vld1_u8(a3); a3 += 8;
+ const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3));
+
+ const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+ const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+ const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+ const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+
+
+ const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+ const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+ const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ const uint8x8_t vb01234567c7 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c7 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c7, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa0), 3);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa1), 3);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa2), 3);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c7), vget_high_s16(vxa3), 3);
+
+ k -= 8 * sizeof(uint8_t);
+ }
+ if XNN_UNLIKELY(k != 0) {
+ const uint8x8_t va0 = vld1_u8(a0); a0 = (const uint8_t*) ((uintptr_t) a0 + k);
+ const int16x8_t vxa0 = vreinterpretq_s16_u16(vmovl_u8(va0));
+ const uint8x8_t va1 = vld1_u8(a1); a1 = (const uint8_t*) ((uintptr_t) a1 + k);
+ const int16x8_t vxa1 = vreinterpretq_s16_u16(vmovl_u8(va1));
+ const uint8x8_t va2 = vld1_u8(a2); a2 = (const uint8_t*) ((uintptr_t) a2 + k);
+ const int16x8_t vxa2 = vreinterpretq_s16_u16(vmovl_u8(va2));
+ const uint8x8_t va3 = vld1_u8(a3); a3 = (const uint8_t*) ((uintptr_t) a3 + k);
+ const int16x8_t vxa3 = vreinterpretq_s16_u16(vmovl_u8(va3));
+
+ const uint8x8_t vb01234567c0 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c0 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c0, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c0), vget_low_s16(vxa3), 0);
+
+ if (k >= 2 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c1 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c1 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c1, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c1), vget_low_s16(vxa3), 1);
+
+ if (k > 2 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c2 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c2 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c2, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c2), vget_low_s16(vxa3), 2);
+
+ if (k >= 4 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c3 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c3 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c3, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa0), 3);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa1), 3);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa2), 3);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c3), vget_low_s16(vxa3), 3);
+
+ if (k > 4 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c4 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c4 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c4, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa0), 0);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa1), 0);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa2), 0);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c4), vget_high_s16(vxa3), 0);
+
+ if (k >= 6 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c5 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c5 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c5, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa0), 1);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa1), 1);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa2), 1);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c5), vget_high_s16(vxa3), 1);
+
+ if (k > 6 * sizeof(uint8_t)) {
+ const uint8x8_t vb01234567c6 = vld1_u8(w); w = (const void*) ((const uint8_t*) w + 8);
+ const int16x8_t vxb01234567c6 = vreinterpretq_s16_u16(vsubl_u8(vb01234567c6, vb_zero_point));
+
+ vacc0x0123 = vmlal_lane_s16(vacc0x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc0x4567 = vmlal_lane_s16(vacc0x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa0), 2);
+ vacc1x0123 = vmlal_lane_s16(vacc1x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc1x4567 = vmlal_lane_s16(vacc1x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa1), 2);
+ vacc2x0123 = vmlal_lane_s16(vacc2x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc2x4567 = vmlal_lane_s16(vacc2x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa2), 2);
+ vacc3x0123 = vmlal_lane_s16(vacc3x0123, vget_low_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ vacc3x4567 = vmlal_lane_s16(vacc3x4567, vget_high_s16(vxb01234567c6), vget_high_s16(vxa3), 2);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ // Post-accumulation work
+ float32x4_t vfpacc0x0123 = vcvtq_f32_s32(vacc0x0123);
+ float32x4_t vfpacc0x4567 = vcvtq_f32_s32(vacc0x4567);
+ float32x4_t vfpacc1x0123 = vcvtq_f32_s32(vacc1x0123);
+ float32x4_t vfpacc1x4567 = vcvtq_f32_s32(vacc1x4567);
+ float32x4_t vfpacc2x0123 = vcvtq_f32_s32(vacc2x0123);
+ float32x4_t vfpacc2x4567 = vcvtq_f32_s32(vacc2x4567);
+ float32x4_t vfpacc3x0123 = vcvtq_f32_s32(vacc3x0123);
+ float32x4_t vfpacc3x4567 = vcvtq_f32_s32(vacc3x4567);
+
+ const float32x4_t vscale = vld1q_dup_f32(&params->fp32_neon.scale);
+ vfpacc0x0123 = vmulq_f32(vfpacc0x0123, vscale);
+ vfpacc0x4567 = vmulq_f32(vfpacc0x4567, vscale);
+ vfpacc1x0123 = vmulq_f32(vfpacc1x0123, vscale);
+ vfpacc1x4567 = vmulq_f32(vfpacc1x4567, vscale);
+ vfpacc2x0123 = vmulq_f32(vfpacc2x0123, vscale);
+ vfpacc2x4567 = vmulq_f32(vfpacc2x4567, vscale);
+ vfpacc3x0123 = vmulq_f32(vfpacc3x0123, vscale);
+ vfpacc3x4567 = vmulq_f32(vfpacc3x4567, vscale);
+
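+ // Magic-bias rounding (a sketch, assuming the usual 2**23-scale bias):
+ // for x in the representable range, float(x + magic_bias) keeps round(x)
+ // in the low mantissa bits, so reinterpreting the sum as int32 and
+ // subtracting magic_bias_less_output_zero_point
+ // (= bits(magic_bias) - output_zero_point) yields
+ // round(x) + output_zero_point without an explicit convert; vqsubq_s32
+ // keeps the subtraction saturating.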
+ const float32x4_t vmagic_bias = vld1q_dup_f32(&params->fp32_neon.magic_bias);
+ vacc0x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x0123, vmagic_bias));
+ vacc0x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc0x4567, vmagic_bias));
+ vacc1x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x0123, vmagic_bias));
+ vacc1x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc1x4567, vmagic_bias));
+ vacc2x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc2x0123, vmagic_bias));
+ vacc2x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc2x4567, vmagic_bias));
+ vacc3x0123 = vreinterpretq_s32_f32(vaddq_f32(vfpacc3x0123, vmagic_bias));
+ vacc3x4567 = vreinterpretq_s32_f32(vaddq_f32(vfpacc3x4567, vmagic_bias));
+
+ const int32x4_t vmagic_bias_less_output_zero_point = vld1q_dup_s32(&params->fp32_neon.magic_bias_less_output_zero_point);
+ vacc0x0123 = vqsubq_s32(vacc0x0123, vmagic_bias_less_output_zero_point);
+ vacc0x4567 = vqsubq_s32(vacc0x4567, vmagic_bias_less_output_zero_point);
+ vacc1x0123 = vqsubq_s32(vacc1x0123, vmagic_bias_less_output_zero_point);
+ vacc1x4567 = vqsubq_s32(vacc1x4567, vmagic_bias_less_output_zero_point);
+ vacc2x0123 = vqsubq_s32(vacc2x0123, vmagic_bias_less_output_zero_point);
+ vacc2x4567 = vqsubq_s32(vacc2x4567, vmagic_bias_less_output_zero_point);
+ vacc3x0123 = vqsubq_s32(vacc3x0123, vmagic_bias_less_output_zero_point);
+ vacc3x4567 = vqsubq_s32(vacc3x4567, vmagic_bias_less_output_zero_point);
+
+#if XNN_ARCH_ARM64
+ int16x8_t vacc0x01234567 = vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567);
+ int16x8_t vacc1x01234567 = vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567);
+ int16x8_t vacc2x01234567 = vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567);
+ int16x8_t vacc3x01234567 = vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567);
+
+
+ uint8x16_t vout0x01234567_1x01234567 = vqmovun_high_s16(vqmovun_s16(vacc0x01234567), vacc1x01234567);
+ uint8x16_t vout2x01234567_3x01234567 = vqmovun_high_s16(vqmovun_s16(vacc2x01234567), vacc3x01234567);
+#else
+ int16x8_t vacc0x01234567 = vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567));
+ int16x8_t vacc1x01234567 = vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567));
+ int16x8_t vacc2x01234567 = vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567));
+ int16x8_t vacc3x01234567 = vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567));
+
+
+ uint8x16_t vout0x01234567_1x01234567 = vcombine_u8(vqmovun_s16(vacc0x01234567), vqmovun_s16(vacc1x01234567));
+ uint8x16_t vout2x01234567_3x01234567 = vcombine_u8(vqmovun_s16(vacc2x01234567), vqmovun_s16(vacc3x01234567));
+#endif
+
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->fp32_neon.output_min);
+ vout0x01234567_1x01234567 = vmaxq_u8(vout0x01234567_1x01234567, voutput_min);
+ vout2x01234567_3x01234567 = vmaxq_u8(vout2x01234567_3x01234567, voutput_min);
+
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->fp32_neon.output_max);
+ vout0x01234567_1x01234567 = vminq_u8(vout0x01234567_1x01234567, voutput_max);
+ vout2x01234567_3x01234567 = vminq_u8(vout2x01234567_3x01234567, voutput_max);
+
+ if (nc >= 8) {
+ vst1_u8(c3 + 0, vget_high_u8(vout2x01234567_3x01234567));
+ vst1_u8(c2 + 0, vget_low_u8(vout2x01234567_3x01234567));
+ vst1_u8(c1 + 0, vget_high_u8(vout0x01234567_1x01234567));
+ vst1_u8(c0 + 0, vget_low_u8(vout0x01234567_1x01234567));
+
+ c3 = (uint8_t*) ((uintptr_t) c3 + cn_stride);
+ c2 = (uint8_t*) ((uintptr_t) c2 + cn_stride);
+ c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const uint8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ vst1q_lane_u32((void*) c3, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 2); c3 += 4;
+ vst1q_lane_u32((void*) c2, vreinterpretq_u32_u8(vout2x01234567_3x01234567), 0); c2 += 4;
+ vst1q_lane_u32((void*) c1, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32((void*) c0, vreinterpretq_u32_u8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+ vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1q_lane_u16((void*) c3, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 4); c3 += 2;
+ vst1q_lane_u16((void*) c2, vreinterpretq_u16_u8(vout2x01234567_3x01234567), 0); c2 += 2;
+ vst1q_lane_u16((void*) c1, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16((void*) c0, vreinterpretq_u16_u8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vout2x01234567_3x01234567 = vextq_u8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+ vout0x01234567_1x01234567 = vextq_u8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1q_lane_u8(c3, vout2x01234567_3x01234567, 8);
+ vst1q_lane_u8(c2, vout2x01234567_3x01234567, 0);
+ vst1q_lane_u8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_u8(c0, vout0x01234567_1x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 9f916c8..e7b77ab 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -489,6 +489,8 @@
DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane)
DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane)
+DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane)
+DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane)
DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane)
DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neon_mlal_lane)
@@ -527,6 +529,13 @@
DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot)
DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot)
+DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55)
+DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128)
+
+DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot)
+DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot)
+DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot)
+
DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64)
DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64)
DECLARE_QU8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64)
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 6a8c51d..c6067b4 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -313,6 +313,8 @@
DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane)
DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane)
+DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane)
+DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane)
DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane)
DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neon_mlal_lane)
@@ -351,6 +353,13 @@
DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_rndnu_ukernel_2x32c4__neondot)
DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_rndnu_ukernel_3x32c4__neondot)
+DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55)
+DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128)
+
+DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot)
+DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot)
+DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot)
+
DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64)
DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_fp32_ukernel_2x4c2__sse2_ld64)
DECLARE_QU8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qu8_igemm_minmax_fp32_ukernel_3x4c2__sse2_ld64)
diff --git a/test/qu8-gemm-minmax-fp32.cc b/test/qu8-gemm-minmax-fp32.cc
index fe384f1..b69bbd2 100644
--- a/test/qu8-gemm-minmax-fp32.cc
+++ b/test/qu8-gemm-minmax-fp32.cc
@@ -23,6 +23,1016 @@
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, no_a_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .a_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, no_b_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .b_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, no_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .a_zero_point(0)
+ .b_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, no_a_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, no_b_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .b_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, no_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_zero_point(0)
+ .b_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(QU8_GEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
@@ -2042,6 +3052,2531 @@
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_ARM64
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .a_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(37)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(163)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cm_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, no_a_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, no_b_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .b_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, no_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .b_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .a_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(37)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(163)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cm_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, no_a_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, no_b_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .b_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, no_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .b_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, no_a_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, no_b_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .b_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_1X16C4__NEONDOT, no_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .b_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
+
+
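All of the cases above validate against xnn_qu8_requantize_fp32 as the reference requantization. A minimal scalar model of FP32 requantization as conventionally defined (an assumption about semantics, not the library's actual source): scale the int32 accumulator in single precision, round to nearest even, add the output zero point, and clamp to [qmin, qmax] — the qmin/qmax cases above pin one clamp bound at 128 to exercise exactly that saturation step.

    // Hedged scalar model of FP32 requantization; assumes the default
    // round-to-nearest-even floating-point environment.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t requantize_fp32(int32_t acc, float scale,
                            uint8_t zero_point, uint8_t qmin, uint8_t qmax) {
      // Scale in single precision, as the fp32 variant's name implies.
      const float scaled = static_cast<float>(acc) * scale;
      // lrintf rounds to nearest, ties to even, in the default mode.
      const long biased = lrintf(scaled) + static_cast<long>(zero_point);
      const long clamped = std::min(std::max(biased, static_cast<long>(qmin)),
                                    static_cast<long>(qmax));
      return static_cast<uint8_t>(clamped);
    }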
+#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(8)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, no_a_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, no_b_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .b_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_2X16C4__NEONDOT, no_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .b_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
+
+
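The k_eq_8 / k_lt_8 / k_gt_8 / k_div_8 split in each group mirrors the kernel's k-block of 8 declared in the YAML spec further below: k below the block exercises only the remainder path, 9..15 exercises one main-loop step plus the remainder, and multiples of 8 skip the remainder entirely. A purely illustrative sketch of that split (kb and the loop bodies are placeholders):

    // Illustrative only: how a K sweep lands on the main loop vs. the tail.
    #include <cstddef>

    void split_k(size_t k) {
      const size_t kb = 8;                  // k-block from the test spec
      const size_t main_steps = k / kb;     // full unrolled iterations
      const size_t remainder  = k % kb;     // left for the tail path
      for (size_t i = 0; i < main_steps; i++) { /* unrolled main body */ }
      if (remainder != 0)                     { /* remainder body */ }
    }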
+#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_eq_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_lt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_gt_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_div_8_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_gt_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_div_16_strided_a) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, no_a_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, no_b_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .b_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_GEMM_MINMAX_FP32_4X16C4__NEONDOT, no_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .b_zero_point(0)
+ .Test(xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
+
+
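The c4 suffix on these neondot kernels corresponds to the UDOT instruction, which folds four 8-bit multiplies into each 32-bit accumulator lane — hence .kr(4) in every case above. A scalar model of a single lane, for orientation only (the real kernels issue four such lanes per instruction across the 16 output columns):

    // Scalar model of one UDOT lane: an unsigned 4-element dot product
    // accumulated into a 32-bit lane.
    #include <cstddef>
    #include <cstdint>

    uint32_t udot_lane(const uint8_t a[4], const uint8_t b[4], uint32_t acc) {
      for (size_t i = 0; i < 4; i++) {
        acc += static_cast<uint32_t>(a[i]) * static_cast<uint32_t>(b[i]);
      }
      return acc;
    }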
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QU8_GEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8) {
TEST_REQUIRES_X86_SSE2;
diff --git a/test/qu8-gemm-minmax-fp32.yaml b/test/qu8-gemm-minmax-fp32.yaml
index adb0316..6293ad6 100644
--- a/test/qu8-gemm-minmax-fp32.yaml
+++ b/test/qu8-gemm-minmax-fp32.yaml
@@ -2,7 +2,12 @@
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-
+- name: xnn_qu8_gemm_minmax_fp32_ukernel_1x8__neon_mlal_lane
+ init: xnn_init_qu8_conv_minmax_fp32_neon_params
+ k-block: 8
+- name: xnn_qu8_gemm_minmax_fp32_ukernel_4x8__neon_mlal_lane
+ init: xnn_init_qu8_conv_minmax_fp32_neon_params
+ k-block: 8
- name: xnn_qu8_gemm_minmax_fp32_ukernel_1x16__neon_mlal_lane
init: xnn_init_qu8_conv_minmax_fp32_neon_params
k-block: 8
@@ -15,6 +20,21 @@
- name: xnn_qu8_gemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane
init: xnn_init_qu8_conv_minmax_fp32_neonv8_params
k-block: 8
+- name: xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+ init: xnn_init_qu8_conv_minmax_fp32_neonv8_params
+ k-block: 16
+- name: xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
+ init: xnn_init_qu8_conv_minmax_fp32_neonv8_params
+ k-block: 16
+- name: xnn_qu8_gemm_minmax_fp32_ukernel_1x16c4__neondot
+ init: xnn_init_qu8_conv_minmax_fp32_neonv8_params
+ k-block: 8
+- name: xnn_qu8_gemm_minmax_fp32_ukernel_2x16c4__neondot
+ init: xnn_init_qu8_conv_minmax_fp32_neonv8_params
+ k-block: 8
+- name: xnn_qu8_gemm_minmax_fp32_ukernel_4x16c4__neondot
+ init: xnn_init_qu8_conv_minmax_fp32_neonv8_params
+ k-block: 8
- name: xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64
init: xnn_init_qu8_conv_minmax_fp32_sse2_params
k-block: 8
diff --git a/test/qu8-igemm-minmax-fp32.cc b/test/qu8-igemm-minmax-fp32.cc
index 3c08f1e..2ddfd9e 100644
--- a/test/qu8-igemm-minmax-fp32.cc
+++ b/test/qu8-igemm-minmax-fp32.cc
@@ -23,6 +23,1040 @@
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_gt_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, n_div_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(43)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 1; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(43)
+ .zero_index(mz)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, no_a_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .a_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, no_b_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .b_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X8__NEON_MLAL_LANE, no_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .a_zero_point(0)
+ .b_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
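Relative to the GEMM groups, these IGEMM cases add the ks, a_offset, and zero_index knobs: an indirect GEMM reads A through a buffer of ks row pointers per output row, every pointer except the shared zero buffer is displaced by a_offset bytes, and the zero test pins each indirection slot to that zero buffer in turn. A hedged sketch of that addressing rule, with placeholder names rather than the kernel's real signature:

    // Hedged sketch of IGEMM operand addressing. Entries equal to `zero`
    // reference a shared zero vector and must not be displaced by a_offset.
    #include <cstddef>
    #include <cstdint>

    void walk_indirection(const uint8_t** a, size_t ks, size_t a_offset,
                          const uint8_t* zero) {
      for (size_t p = 0; p < ks; p++) {
        const uint8_t* row = a[p];
        if (row != zero) {
          row = reinterpret_cast<const uint8_t*>(
              reinterpret_cast<uintptr_t>(row) + a_offset);
        }
        // ...accumulate this row's K values into the output tile...
        (void) row;
      }
    }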
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
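+  // ks(3) sets the IGEMM kernel size: the number of indirection-pointer sets
+  // accumulated per output pixel, presumably modeling a small (e.g. 3-tap)
+  // convolution window.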
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_gt_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, n_div_8_small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
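+  // a_offset is the displacement the IGEMM adds to each non-zero indirection
+  // pointer; the `zero` case below additionally redirects one pointer row
+  // (zero_index) to the zero buffer, which must be read without that offset.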
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
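+  // qmin(128)/qmax(128) clamp away half of the uint8 output range, verifying
+  // that the fused activation bounds survive the fp32 requantization path.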
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, no_a_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, no_b_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .b_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X8__NEON_MLAL_LANE, no_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(1)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .a_zero_point(0)
+ .b_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane, xnn_init_qu8_conv_minmax_fp32_neon_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(QU8_IGEMM_MINMAX_FP32_1X16__NEON_MLAL_LANE, k_eq_8) {
TEST_REQUIRES_ARM_NEON;
GemmMicrokernelTester()
@@ -2090,6 +3124,2591 @@
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_ARM64
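+  // Coverage for the hand-written AArch64 4x16c4 NEONDOT IGEMM kernel tuned
+  // for Cortex-A55. kr(4) matches the 4-element dot-product accumulation, and
+  // the fp32 path is driven through the neonv8 params/requantization helpers.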
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, a_offset) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(331)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, zero) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(331)
+ .zero_index(mz)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cm_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, no_a_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, no_b_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .b_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_CORTEX_A55, no_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .b_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM64
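+  // Same 4x16c4 tile as the Cortex-A55 kernel above; the ld128 suffix
+  // presumably denotes 128-bit loads with generic scheduling, so the test
+  // battery is identical apart from the ukernel under test.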
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, a_offset) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(331)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, zero) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(331)
+ .zero_index(mz)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cm_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, no_a_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, no_b_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .b_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__AARCH64_NEONDOT_LD128, no_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .b_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+#endif // XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
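+  // The intrinsics-based NEONDOT kernels also build for 32-bit ARM, except on
+  // iOS, where the 32-bit toolchain presumably lacks dot-product support.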
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+          .n(n)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+          .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+          .n(n)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+          .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, a_offset) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(43)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, zero) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t mz = 0; mz < 1; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(43)
+ .zero_index(mz)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, no_a_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, no_b_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .b_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_1X16C4__NEONDOT, no_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .b_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(8)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+          .n(n)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, a_offset) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, zero) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t mz = 0; mz < 2; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .zero_index(mz)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, no_a_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, no_b_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .b_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_2X16C4__NEONDOT, no_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .b_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
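The fp32-suffixed suites above differ from their rndnu counterparts only in the requantization reference they check against: xnn_qu8_requantize_fp32, initialized via xnn_init_qu8_requantization_fp32_params. As a rough guide to what that reference computes, here is a minimal C++ sketch assuming the conventional fp32 scheme (scale in float, clamp in the zero-point-shifted domain, round to nearest); see src/xnnpack/requantization.h for the library's own definition:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Illustrative only: scale the int32 accumulator in fp32, clamp against
    // [qmin, qmax] shifted by the output zero point, then round to nearest
    // and shift back.
    static uint8_t requantize_fp32_sketch(
        int32_t acc, float scale, uint8_t zero_point, uint8_t qmin, uint8_t qmax) {
      const float min_less_zp = static_cast<float>(qmin - zero_point);
      const float max_less_zp = static_cast<float>(qmax - zero_point);
      float scaled = static_cast<float>(acc) * scale;
      scaled = std::min(std::max(scaled, min_less_zp), max_less_zp);
      return static_cast<uint8_t>(std::lrintf(scaled) + zero_point);
    }

The qmin(128)/qmax(128) tests pin the clamp path of this computation, while the k sweeps pin the accumulation loop feeding it.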
+
+
+#if XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, k_eq_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, k_eq_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, k_eq_8_subtile_m) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, k_eq_8_subtile_n) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, k_lt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, k_lt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, k_gt_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, k_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, k_div_8) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, k_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, n_gt_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, n_gt_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, n_div_16) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, n_gt_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, n_div_16_small_kernel) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, a_offset) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, zero) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, qmin) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, qmax) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, strided_cm) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, no_a_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, no_b_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .b_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+
+ TEST(QU8_IGEMM_MINMAX_FP32_4X16C4__NEONDOT, no_zero_point) {
+ TEST_REQUIRES_ARM_NEON_DOT;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(4)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .a_zero_point(0)
+ .b_zero_point(0)
+ .Test(xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot, xnn_init_qu8_conv_minmax_fp32_neonv8_params, xnn_init_qu8_requantization_fp32_params, xnn_qu8_requantize_fp32);
+ }
+ }
+#endif // XNN_ARCH_ARM && !XNN_PLATFORM_IOS || XNN_ARCH_ARM64
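The a_offset and zero cases in both suites are specific to IGEMM: the kernel reads its input rows through an indirection buffer, entries of which may alias a shared zero buffer. Pointers to the zero buffer must be used verbatim, while all others are rebased by a_offset; zero_index(mz) aliases row mz to the zero buffer to exercise exactly that branch. A hedged sketch of the pointer resolution, with hypothetical names (the real kernels do this inline per indirection entry):

    #include <cstddef>
    #include <cstdint>

    // Illustrative helper: rows aliased to the zero buffer are read as-is;
    // everything else is rebased by a_offset into the caller's input tensor.
    static const uint8_t* resolve_igemm_row(
        const uint8_t* indirect_row, const uint8_t* zero, size_t a_offset) {
      return indirect_row == zero ? indirect_row : indirect_row + a_offset;
    }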
+
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QU8_IGEMM_MINMAX_FP32_1X4C2__SSE2_LD64, k_eq_8) {
TEST_REQUIRES_X86_SSE2;
diff --git a/test/qu8-igemm-minmax-fp32.yaml b/test/qu8-igemm-minmax-fp32.yaml
index 43dae10..d5ad131 100644
--- a/test/qu8-igemm-minmax-fp32.yaml
+++ b/test/qu8-igemm-minmax-fp32.yaml
@@ -2,7 +2,12 @@
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
-
+- name: xnn_qu8_igemm_minmax_fp32_ukernel_1x8__neon_mlal_lane
+ init: xnn_init_qu8_conv_minmax_fp32_neon_params
+ k-block: 8
+- name: xnn_qu8_igemm_minmax_fp32_ukernel_4x8__neon_mlal_lane
+ init: xnn_init_qu8_conv_minmax_fp32_neon_params
+ k-block: 8
- name: xnn_qu8_igemm_minmax_fp32_ukernel_1x16__neon_mlal_lane
init: xnn_init_qu8_conv_minmax_fp32_neon_params
k-block: 8
@@ -15,6 +20,21 @@
- name: xnn_qu8_igemm_minmax_fp32_ukernel_4x16__neonv8_mlal_lane
init: xnn_init_qu8_conv_minmax_fp32_neonv8_params
k-block: 8
+- name: xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_cortex_a55
+ init: xnn_init_qu8_conv_minmax_fp32_neonv8_params
+ k-block: 16
+- name: xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__aarch64_neondot_ld128
+ init: xnn_init_qu8_conv_minmax_fp32_neonv8_params
+ k-block: 16
+- name: xnn_qu8_igemm_minmax_fp32_ukernel_1x16c4__neondot
+ init: xnn_init_qu8_conv_minmax_fp32_neonv8_params
+ k-block: 8
+- name: xnn_qu8_igemm_minmax_fp32_ukernel_2x16c4__neondot
+ init: xnn_init_qu8_conv_minmax_fp32_neonv8_params
+ k-block: 8
+- name: xnn_qu8_igemm_minmax_fp32_ukernel_4x16c4__neondot
+ init: xnn_init_qu8_conv_minmax_fp32_neonv8_params
+ k-block: 8
- name: xnn_qu8_igemm_minmax_fp32_ukernel_1x4c2__sse2_ld64
init: xnn_init_qu8_conv_minmax_fp32_sse2_params
k-block: 8
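Each entry in this spec drives the test generator: name selects the ukernel, init names the params initializer handed to GemmMicrokernelTester::Test, and k-block scales the loop bounds of the emitted suites. With k-block 8, k_lt_8 spans 1..7, k_gt_8 spans 9..15, and k_div_8 spans 16..80 in steps of 8, exactly as in the generated tests above; the aarch64 entries declared with k-block 16 get proportionally wider sweeps. Regenerating the .cc file amounts to rerunning the generator against this spec, roughly as follows (the exact flags are an assumption):

    tools/generate-gemm-test.py --spec test/qu8-igemm-minmax-fp32.yaml --output test/qu8-igemm-minmax-fp32.cc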