Use FP32 requantization for extended-weights QS8 GEMM microkernels on x86
PiperOrigin-RevId: 389282162
diff --git a/BUILD.bazel b/BUILD.bazel
index adecd7e..0c9dd8f 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -3637,27 +3637,27 @@
"src/qs8-gavgpool/gen/7x-minmax-sse2-c24-acc2.c",
"src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c",
"src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c",
- "src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c",
+ "src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse2.c",
"src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
"src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c",
- "src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c",
+ "src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse2.c",
"src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c",
"src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c",
- "src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c",
+ "src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse2.c",
"src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c",
"src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c",
"src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c",
- "src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c",
+ "src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse2.c",
"src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c",
"src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c",
- "src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c",
+ "src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse2.c",
"src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c",
"src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c",
- "src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c",
+ "src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse2.c",
"src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c",
"src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c",
"src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c",
- "src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c",
+ "src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse2.c",
"src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c",
"src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c",
"src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c",
@@ -3783,21 +3783,18 @@
"src/qs8-gavgpool/gen/7x-minmax-ssse3-c8-acc2.c",
"src/qs8-gavgpool/gen/7x-minmax-ssse3-c16-acc2.c",
"src/qs8-gavgpool/gen/7x-minmax-ssse3-c24-acc2.c",
- "src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c",
+ "src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-ssse3.c",
"src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c",
"src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c",
- "src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c",
- "src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c",
+ "src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-ssse3.c",
"src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c",
"src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c",
"src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c",
- "src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c",
- "src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c",
+ "src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-ssse3.c",
"src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c",
"src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c",
- "src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c",
+ "src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-ssse3.c",
"src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c",
- "src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c",
"src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c",
"src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c",
"src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c",
@@ -3975,27 +3972,27 @@
"src/qs8-gavgpool/gen/7x-minmax-sse41-c24-acc2.c",
"src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c",
"src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c",
- "src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c",
+ "src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse41.c",
"src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
"src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c",
- "src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c",
+ "src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse41.c",
"src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c",
"src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c",
- "src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c",
+ "src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse41.c",
"src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c",
"src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c",
"src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c",
- "src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c",
+ "src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse41.c",
"src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c",
"src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c",
- "src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c",
+ "src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse41.c",
"src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c",
"src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c",
- "src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c",
+ "src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse41.c",
"src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c",
"src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c",
"src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c",
- "src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c",
+ "src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse41.c",
"src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c",
"src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c",
"src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c",
@@ -4379,25 +4376,25 @@
"src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul32.c",
"src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c",
"src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c",
- "src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c",
+ "src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-avx.c",
"src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c",
"src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c",
- "src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c",
+ "src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-avx.c",
"src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c",
"src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c",
- "src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c",
+ "src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-avx.c",
"src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c",
"src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c",
- "src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c",
+ "src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-avx.c",
"src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c",
"src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c",
- "src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c",
+ "src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-avx.c",
"src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c",
"src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c",
- "src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c",
+ "src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-avx.c",
"src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c",
"src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c",
- "src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c",
+ "src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-avx.c",
"src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c",
"src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c",
"src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c",
@@ -4564,25 +4561,25 @@
"src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-xop-mul32.c",
"src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c",
"src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c",
- "src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c",
+ "src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-xop.c",
"src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
"src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c",
- "src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c",
+ "src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-xop.c",
"src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c",
"src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c",
- "src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c",
+ "src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-xop.c",
"src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c",
"src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c",
- "src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c",
+ "src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-xop.c",
"src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c",
"src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c",
- "src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c",
+ "src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-xop.c",
"src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c",
"src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c",
- "src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c",
+ "src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-xop.c",
"src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c",
"src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c",
- "src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c",
+ "src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-xop.c",
"src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c",
"src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c",
"src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c",
@@ -4982,15 +4979,12 @@
"src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c",
"src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-avx2.c",
"src/qs8-gemm/gen/1x8c8-xw-minmax-fp32-avx2.c",
- "src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c",
"src/qs8-gemm/gen/2x8c8-minmax-fp32-avx2.c",
"src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-avx2.c",
"src/qs8-gemm/gen/2x8c8-xw-minmax-fp32-avx2.c",
- "src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c",
"src/qs8-gemm/gen/3x8c8-minmax-fp32-avx2.c",
"src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-avx2.c",
"src/qs8-gemm/gen/3x8c8-xw-minmax-fp32-avx2.c",
- "src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c",
"src/qs8-igemm/gen/1x8c8-minmax-fp32-avx2.c",
"src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-avx2.c",
"src/qs8-igemm/gen/2x8c8-minmax-fp32-avx2.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7f11dd7..6d1ce3a 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2802,27 +2802,27 @@
src/qs8-gavgpool/gen/7x-minmax-sse2-c24-acc2.c
src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
src/qs8-gemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
- src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
+ src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse2.c
src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
src/qs8-gemm/gen/1x4c8-minmax-fp32-sse2-ld128.c
- src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
+ src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse2.c
src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld64.c
src/qs8-gemm/gen/2x4c2-minmax-fp32-sse2-ld128.c
- src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
+ src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse2.c
src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld64.c
src/qs8-gemm/gen/2x4c8-minmax-fp32-sse2-ld128.c
src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
- src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
+ src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse2.c
src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld64.c
src/qs8-gemm/gen/3x4c2-minmax-fp32-sse2-ld128.c
- src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
+ src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse2.c
src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld64.c
src/qs8-gemm/gen/3x4c8-minmax-fp32-sse2-ld128.c
- src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
+ src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse2.c
src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld64.c
src/qs8-gemm/gen/4x4c2-minmax-fp32-sse2-ld128.c
src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse2-ld64.c
- src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
+ src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse2.c
src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld64.c
src/qs8-igemm/gen/1x4c2-minmax-fp32-sse2-ld128.c
src/qs8-igemm/gen/1x4c8-minmax-fp32-sse2-ld64.c
@@ -2946,21 +2946,18 @@
src/qs8-gavgpool/gen/7x-minmax-ssse3-c8-acc2.c
src/qs8-gavgpool/gen/7x-minmax-ssse3-c16-acc2.c
src/qs8-gavgpool/gen/7x-minmax-ssse3-c24-acc2.c
- src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
+ src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-ssse3.c
src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
src/qs8-gemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
- src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
- src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
+ src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-ssse3.c
src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
src/qs8-gemm/gen/2x4c8-minmax-fp32-ssse3-ld128.c
src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-ssse3-ld64.c
- src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
- src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
+ src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-ssse3.c
src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld64.c
src/qs8-gemm/gen/3x4c8-minmax-fp32-ssse3-ld128.c
- src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
+ src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-ssse3.c
src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-ssse3-ld64.c
- src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld64.c
src/qs8-igemm/gen/1x4c8-minmax-fp32-ssse3-ld128.c
src/qs8-igemm/gen/2x4c8-minmax-fp32-ssse3-ld64.c
@@ -3136,27 +3133,27 @@
src/qs8-gavgpool/gen/7x-minmax-sse41-c24-acc2.c
src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
src/qs8-gemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
- src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
+ src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse41.c
src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
src/qs8-gemm/gen/1x4c8-minmax-fp32-sse41-ld128.c
- src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
+ src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse41.c
src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld64.c
src/qs8-gemm/gen/2x4c2-minmax-fp32-sse41-ld128.c
- src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
+ src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse41.c
src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld64.c
src/qs8-gemm/gen/2x4c8-minmax-fp32-sse41-ld128.c
src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse41-ld64.c
- src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
+ src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse41.c
src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld64.c
src/qs8-gemm/gen/3x4c2-minmax-fp32-sse41-ld128.c
- src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
+ src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse41.c
src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld64.c
src/qs8-gemm/gen/3x4c8-minmax-fp32-sse41-ld128.c
- src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
+ src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse41.c
src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld64.c
src/qs8-gemm/gen/4x4c2-minmax-fp32-sse41-ld128.c
src/qs8-gemm/gen/4x4c2-minmax-gemmlowp-sse41-ld64.c
- src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
+ src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse41.c
src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld64.c
src/qs8-igemm/gen/1x4c2-minmax-fp32-sse41-ld128.c
src/qs8-igemm/gen/1x4c8-minmax-fp32-sse41-ld64.c
@@ -3536,25 +3533,25 @@
src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-avx-mul32.c
src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld64.c
src/qs8-gemm/gen/1x4c2-minmax-fp32-avx-ld128.c
- src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
+ src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-avx.c
src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld64.c
src/qs8-gemm/gen/1x4c8-minmax-fp32-avx-ld128.c
- src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
+ src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-avx.c
src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld64.c
src/qs8-gemm/gen/2x4c2-minmax-fp32-avx-ld128.c
- src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
+ src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-avx.c
src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld64.c
src/qs8-gemm/gen/2x4c8-minmax-fp32-avx-ld128.c
- src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
+ src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-avx.c
src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld64.c
src/qs8-gemm/gen/3x4c2-minmax-fp32-avx-ld128.c
- src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
+ src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-avx.c
src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld64.c
src/qs8-gemm/gen/3x4c8-minmax-fp32-avx-ld128.c
- src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
+ src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-avx.c
src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld64.c
src/qs8-gemm/gen/4x4c2-minmax-fp32-avx-ld128.c
- src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
+ src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-avx.c
src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld64.c
src/qs8-igemm/gen/1x4c2-minmax-fp32-avx-ld128.c
src/qs8-igemm/gen/1x4c8-minmax-fp32-avx-ld64.c
@@ -3719,25 +3716,25 @@
src/qs8-dwconv/gen/up24x25-minmax-gemmlowp-xop-mul32.c
src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld64.c
src/qs8-gemm/gen/1x4c2-minmax-fp32-xop-ld128.c
- src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
+ src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-xop.c
src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld64.c
src/qs8-gemm/gen/1x4c8-minmax-fp32-xop-ld128.c
- src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
+ src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-xop.c
src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld64.c
src/qs8-gemm/gen/2x4c2-minmax-fp32-xop-ld128.c
- src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
+ src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-xop.c
src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld64.c
src/qs8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
- src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
+ src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-xop.c
src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld64.c
src/qs8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
- src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
+ src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-xop.c
src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld64.c
src/qs8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
- src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
+ src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-xop.c
src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld64.c
src/qs8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
- src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
+ src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-xop.c
src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld64.c
src/qs8-igemm/gen/1x4c2-minmax-fp32-xop-ld128.c
src/qs8-igemm/gen/1x4c8-minmax-fp32-xop-ld64.c
@@ -4135,15 +4132,12 @@
src/qs8-gemm/gen/1x8c8-minmax-fp32-avx2.c
src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-avx2.c
src/qs8-gemm/gen/1x8c8-xw-minmax-fp32-avx2.c
- src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c
src/qs8-gemm/gen/2x8c8-minmax-fp32-avx2.c
src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-avx2.c
src/qs8-gemm/gen/2x8c8-xw-minmax-fp32-avx2.c
- src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c
src/qs8-gemm/gen/3x8c8-minmax-fp32-avx2.c
src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-avx2.c
src/qs8-gemm/gen/3x8c8-xw-minmax-fp32-avx2.c
- src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c
src/qs8-igemm/gen/1x8c8-minmax-fp32-avx2.c
src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-avx2.c
src/qs8-igemm/gen/2x8c8-minmax-fp32-avx2.c
diff --git a/bench/qs8-gemm.cc b/bench/qs8-gemm.cc
index 2f15a3b..e560d6d 100644
--- a/bench/qs8-gemm.cc
+++ b/bench/qs8-gemm.cc
@@ -624,13 +624,13 @@
xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2);
}
- static void qs8_gemm_xw_2x8c8_gemmlowp__avx2(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, 2, 8, 8, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, benchmark::utils::CheckAVX2, true);
+ static void qs8_gemm_xw_2x8c8__avx2(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2, 2, 8, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2, true);
}
- static void qs8_gemm_xw_3x8c8_gemmlowp__avx2(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, 3, 8, 8, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, benchmark::utils::CheckAVX2, true);
+ static void qs8_gemm_xw_3x8c8__avx2(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2, 3, 8, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_avx2_params, benchmark::utils::CheckAVX2, true);
}
static void qs8_gemm_2x4c2__xop_ld64(benchmark::State& state, const char* net) {
@@ -659,17 +659,17 @@
xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
}
- static void qs8_gemm_xw_2x4c2_gemmlowp__xop(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__xop, 2, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP, true);
+ static void qs8_gemm_xw_2x4c2__xop(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, 2, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
}
- static void qs8_gemm_xw_3x4c2_gemmlowp__xop(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__xop, 3, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP, true);
+ static void qs8_gemm_xw_3x4c2__xop(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, 3, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
}
- static void qs8_gemm_xw_4x4c2_gemmlowp__xop(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, 4, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP, true);
+ static void qs8_gemm_xw_4x4c2__xop(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, 4, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
}
static void qs8_gemm_2x4c8__xop_ld64(benchmark::State& state, const char* net) {
@@ -690,13 +690,13 @@
xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP);
}
- static void qs8_gemm_xw_2x4c8_gemmlowp__xop(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, 2, 4, 8, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP, true);
+ static void qs8_gemm_xw_2x4c8__xop(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
}
- static void qs8_gemm_xw_3x4c8_gemmlowp__xop(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, 3, 4, 8, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckXOP, true);
+ static void qs8_gemm_xw_3x4c8__xop(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckXOP, true);
}
static void qs8_gemm_2x4c2__avx_ld64(benchmark::State& state, const char* net) {
@@ -725,17 +725,17 @@
xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
}
- static void qs8_gemm_xw_2x4c2_gemmlowp__avx(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__avx, 2, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX, true);
+ static void qs8_gemm_xw_2x4c2__avx(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, 2, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
}
- static void qs8_gemm_xw_3x4c2_gemmlowp__avx(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__avx, 3, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX, true);
+ static void qs8_gemm_xw_3x4c2__avx(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, 3, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
}
- static void qs8_gemm_xw_4x4c2_gemmlowp__avx(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, 4, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX, true);
+ static void qs8_gemm_xw_4x4c2__avx(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, 4, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
}
static void qs8_gemm_2x4c8__avx_ld64(benchmark::State& state, const char* net) {
@@ -756,13 +756,13 @@
xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX);
}
- static void qs8_gemm_xw_2x4c8_gemmlowp__avx(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, 2, 4, 8, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX, true);
+ static void qs8_gemm_xw_2x4c8__avx(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
}
- static void qs8_gemm_xw_3x4c8_gemmlowp__avx(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, 3, 4, 8, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckAVX, true);
+ static void qs8_gemm_xw_3x4c8__avx(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckAVX, true);
}
static void qs8_gemm_2x4c2__sse41_ld64(benchmark::State& state, const char* net) {
@@ -791,17 +791,17 @@
xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
}
- static void qs8_gemm_xw_2x4c2_gemmlowp__sse41(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41, 2, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41, true);
+ static void qs8_gemm_xw_2x4c2__sse41(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, 2, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
}
- static void qs8_gemm_xw_3x4c2_gemmlowp__sse41(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse41, 3, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41, true);
+ static void qs8_gemm_xw_3x4c2__sse41(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, 3, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
}
- static void qs8_gemm_xw_4x4c2_gemmlowp__sse41(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, 4, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41, true);
+ static void qs8_gemm_xw_4x4c2__sse41(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, 4, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
}
static void qs8_gemm_2x4c8__sse41_ld64(benchmark::State& state, const char* net) {
@@ -822,26 +822,13 @@
xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41);
}
- static void qs8_gemm_xw_2x4c8_gemmlowp__sse41(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, 2, 4, 8, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41, true);
+ static void qs8_gemm_xw_2x4c8__sse41(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
}
- static void qs8_gemm_xw_3x4c8_gemmlowp__sse41(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, 3, 4, 8, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, benchmark::utils::CheckSSE41, true);
- }
-
- static void qs8_gemm_xw_2x4c2_gemmlowp__ssse3(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__ssse3, 2, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3, true);
- }
- static void qs8_gemm_xw_3x4c2_gemmlowp__ssse3(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__ssse3, 3, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3, true);
- }
- static void qs8_gemm_xw_4x4c2_gemmlowp__ssse3(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, 4, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3, true);
+ static void qs8_gemm_xw_3x4c8__sse41(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse4_params, benchmark::utils::CheckSSE41, true);
}
static void qs8_gemm_2x4c8__ssse3_ld64(benchmark::State& state, const char* net) {
@@ -862,13 +849,13 @@
xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3);
}
- static void qs8_gemm_xw_2x4c8_gemmlowp__ssse3(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, 2, 4, 8, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3, true);
+ static void qs8_gemm_xw_2x4c8__ssse3(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3, true);
}
- static void qs8_gemm_xw_3x4c8_gemmlowp__ssse3(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, 3, 4, 8, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, benchmark::utils::CheckSSSE3, true);
+ static void qs8_gemm_xw_3x4c8__ssse3(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, benchmark::utils::CheckSSSE3, true);
}
static void qs8_gemm_2x4c2__sse2_ld64(benchmark::State& state, const char* net) {
@@ -897,17 +884,17 @@
xnn_init_qs8_conv_minmax_fp32_sse2_params);
}
- static void qs8_gemm_xw_2x4c2_gemmlowp__sse2(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse2, 2, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, nullptr, true);
+ static void qs8_gemm_xw_2x4c2__sse2(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, 2, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
}
- static void qs8_gemm_xw_3x4c2_gemmlowp__sse2(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse2, 3, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, nullptr, true);
+ static void qs8_gemm_xw_3x4c2__sse2(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, 3, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
}
- static void qs8_gemm_xw_4x4c2_gemmlowp__sse2(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, 4, 4, 2, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, nullptr, true);
+ static void qs8_gemm_xw_4x4c2__sse2(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, 4, 4, 2, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
}
static void qs8_gemm_2x4c8__sse2_ld64(benchmark::State& state, const char* net) {
@@ -928,13 +915,13 @@
xnn_init_qs8_conv_minmax_fp32_sse2_params);
}
- static void qs8_gemm_xw_2x4c8_gemmlowp__sse2(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, 2, 4, 8, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, nullptr, true);
+ static void qs8_gemm_xw_2x4c8__sse2(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, 2, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
}
- static void qs8_gemm_xw_3x4c8_gemmlowp__sse2(benchmark::State& state, const char* net) {
- GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, 3, 4, 8, 1,
- xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, nullptr, true);
+ static void qs8_gemm_xw_3x4c8__sse2(benchmark::State& state, const char* net) {
+ GEMMBenchmark(state, xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, 3, 4, 8, 1,
+ xnn_init_qs8_conv_minmax_fp32_sse2_params, nullptr, true);
}
BENCHMARK_GEMM(qs8_gemm_2x16c8__avx512skx)
@@ -943,8 +930,8 @@
BENCHMARK_GEMM(qs8_gemm_2x8c8__avx2)
BENCHMARK_GEMM(qs8_gemm_3x8c8__avx2)
- BENCHMARK_GEMM(qs8_gemm_xw_2x8c8_gemmlowp__avx2)
- BENCHMARK_GEMM(qs8_gemm_xw_3x8c8_gemmlowp__avx2)
+ BENCHMARK_GEMM(qs8_gemm_xw_2x8c8__avx2)
+ BENCHMARK_GEMM(qs8_gemm_xw_3x8c8__avx2)
BENCHMARK_GEMM(qs8_gemm_2x4c2__xop_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c2__xop_ld64)
@@ -952,15 +939,15 @@
BENCHMARK_GEMM(qs8_gemm_2x4c2__xop_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c2__xop_ld128)
BENCHMARK_GEMM(qs8_gemm_4x4c2__xop_ld128)
- BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__xop)
- BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__xop)
- BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__xop)
+ BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__xop)
+ BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__xop)
+ BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__xop)
BENCHMARK_GEMM(qs8_gemm_2x4c8__xop_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c8__xop_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c8__xop_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c8__xop_ld128)
- BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__xop)
- BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__xop)
+ BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__xop)
+ BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__xop)
BENCHMARK_GEMM(qs8_gemm_2x4c2__avx_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c2__avx_ld64)
@@ -968,15 +955,15 @@
BENCHMARK_GEMM(qs8_gemm_2x4c2__avx_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c2__avx_ld128)
BENCHMARK_GEMM(qs8_gemm_4x4c2__avx_ld128)
- BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__avx)
- BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__avx)
- BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__avx)
+ BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__avx)
+ BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__avx)
+ BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__avx)
BENCHMARK_GEMM(qs8_gemm_2x4c8__avx_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c8__avx_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c8__avx_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c8__avx_ld128)
- BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__avx)
- BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__avx)
+ BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__avx)
+ BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__avx)
BENCHMARK_GEMM(qs8_gemm_2x4c2__sse41_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c2__sse41_ld64)
@@ -984,25 +971,22 @@
BENCHMARK_GEMM(qs8_gemm_2x4c2__sse41_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c2__sse41_ld128)
BENCHMARK_GEMM(qs8_gemm_4x4c2__sse41_ld128)
- BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__sse41)
- BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__sse41)
- BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__sse41)
+ BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__sse41)
+ BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__sse41)
+ BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__sse41)
BENCHMARK_GEMM(qs8_gemm_2x4c8__sse41_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c8__sse41_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c8__sse41_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c8__sse41_ld128)
- BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__sse41)
- BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__sse41)
+ BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__sse41)
+ BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__sse41)
- BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__ssse3)
- BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__ssse3)
- BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__ssse3)
BENCHMARK_GEMM(qs8_gemm_2x4c8__ssse3_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c8__ssse3_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c8__ssse3_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c8__ssse3_ld128)
- BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__ssse3)
- BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__ssse3)
+ BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__ssse3)
+ BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__ssse3)
BENCHMARK_GEMM(qs8_gemm_2x4c2__sse2_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c2__sse2_ld64)
@@ -1010,15 +994,15 @@
BENCHMARK_GEMM(qs8_gemm_2x4c2__sse2_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c2__sse2_ld128)
BENCHMARK_GEMM(qs8_gemm_4x4c2__sse2_ld128)
- BENCHMARK_GEMM(qs8_gemm_xw_2x4c2_gemmlowp__sse2)
- BENCHMARK_GEMM(qs8_gemm_xw_3x4c2_gemmlowp__sse2)
- BENCHMARK_GEMM(qs8_gemm_xw_4x4c2_gemmlowp__sse2)
+ BENCHMARK_GEMM(qs8_gemm_xw_2x4c2__sse2)
+ BENCHMARK_GEMM(qs8_gemm_xw_3x4c2__sse2)
+ BENCHMARK_GEMM(qs8_gemm_xw_4x4c2__sse2)
BENCHMARK_GEMM(qs8_gemm_2x4c8__sse2_ld64)
BENCHMARK_GEMM(qs8_gemm_3x4c8__sse2_ld64)
BENCHMARK_GEMM(qs8_gemm_2x4c8__sse2_ld128)
BENCHMARK_GEMM(qs8_gemm_3x4c8__sse2_ld128)
- BENCHMARK_GEMM(qs8_gemm_xw_2x4c8_gemmlowp__sse2)
- BENCHMARK_GEMM(qs8_gemm_xw_3x4c8_gemmlowp__sse2)
+ BENCHMARK_GEMM(qs8_gemm_xw_2x4c8__sse2)
+ BENCHMARK_GEMM(qs8_gemm_xw_3x4c8__sse2)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/scripts/generate-qs8-gemm.sh b/scripts/generate-qs8-gemm.sh
index fef2379..a847184 100755
--- a/scripts/generate-qs8-gemm.sh
+++ b/scripts/generate-qs8-gemm.sh
@@ -512,30 +512,25 @@
tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QU8 -D REQUANTIZATION=FP32 -D VARIANT=LD128 -o src/qu8-gemm/gen/3x4c2-minmax-fp32-xop-ld128.c
tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QU8 -D REQUANTIZATION=FP32 -D VARIANT=LD128 -o src/qu8-gemm/gen/4x4c2-minmax-fp32-xop-ld128.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse2.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse2.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse2.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse2.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse41.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse41.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse41.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse41.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-avx.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-avx.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-avx.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-avx.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
-
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-xop.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-xop.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-xop.c
+tools/xngen src/qs8-gemm/MRx4c2-sse.c.in -D MR=4 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-xop.c
### C8 micro-kernels
tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=LD64 -o src/qs8-gemm/gen/2x4c8-minmax-gemmlowp-sse2-ld64.c
@@ -651,32 +646,28 @@
tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QU8 -D REQUANTIZATION=FP32 -D VARIANT=LD128 -o src/qu8-gemm/gen/2x4c8-minmax-fp32-xop-ld128.c
tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QU8 -D REQUANTIZATION=FP32 -D VARIANT=LD128 -o src/qu8-gemm/gen/3x4c8-minmax-fp32-xop-ld128.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse2.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse2.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse2.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-ssse3.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=3 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-ssse3.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse41.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse41.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse41.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-avx.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-avx.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=0 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-avx.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
-tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=1 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-xop.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=2 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-xop.c
+tools/xngen src/qs8-gemm/MRx4c8-sse.c.in -D MR=3 -D SSE=4 -D AVX=1 -D XOP=1 -D DATATYPE=QS8 -D REQUANTIZATION=FP32 -D VARIANT=EXTENDED -o src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-xop.c
################################### x86 AVX2 ##################################
### C8 micro-kernels
-tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=1 -D VARIANT=LD128 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-avx2.c
-tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=2 -D VARIANT=LD128 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-avx2.c
-tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=3 -D VARIANT=LD128 -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/3x8c8-minmax-gemmlowp-avx2.c
-
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=1 -D VARIANT=LD128 -D DATATYPE=QC8 -D REQUANTIZATION=FP32 -o src/qc8-gemm/gen/1x8c8-minmax-fp32-avx2.c
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=2 -D VARIANT=LD128 -D DATATYPE=QC8 -D REQUANTIZATION=FP32 -o src/qc8-gemm/gen/2x8c8-minmax-fp32-avx2.c
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=3 -D VARIANT=LD128 -D DATATYPE=QC8 -D REQUANTIZATION=FP32 -o src/qc8-gemm/gen/3x8c8-minmax-fp32-avx2.c
@@ -689,10 +680,6 @@
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=2 -D VARIANT=LD128 -D DATATYPE=QU8 -D REQUANTIZATION=FP32 -o src/qu8-gemm/gen/2x8c8-minmax-fp32-avx2.c
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=3 -D VARIANT=LD128 -D DATATYPE=QU8 -D REQUANTIZATION=FP32 -o src/qu8-gemm/gen/3x8c8-minmax-fp32-avx2.c
-tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=1 -D VARIANT=EXTENDED -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c
-tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=2 -D VARIANT=EXTENDED -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c
-tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=3 -D VARIANT=EXTENDED -D DATATYPE=QS8 -D REQUANTIZATION=GEMMLOWP -o src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c
-
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=1 -D VARIANT=EXTENDED -D DATATYPE=QC8 -D REQUANTIZATION=FP32 -o src/qc8-gemm/gen/1x8c8-xw-minmax-fp32-avx2.c
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=2 -D VARIANT=EXTENDED -D DATATYPE=QC8 -D REQUANTIZATION=FP32 -o src/qc8-gemm/gen/2x8c8-xw-minmax-fp32-avx2.c
tools/xngen src/qs8-gemm/MRx8c8-avx2.c.in -D MR=3 -D VARIANT=EXTENDED -D DATATYPE=QC8 -D REQUANTIZATION=FP32 -o src/qc8-gemm/gen/3x8c8-xw-minmax-fp32-avx2.c
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-avx.c
similarity index 73%
copy from src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
copy to src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-avx.c
index b649211..108c9b3 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-avx.c
@@ -16,7 +16,7 @@
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx(
size_t mr,
size_t nc,
size_t kc,
@@ -99,37 +99,21 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse2.c
similarity index 67%
copy from src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
copy to src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse2.c
index b649211..0d3fe35 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse2.c
@@ -9,14 +9,14 @@
#include <assert.h>
-#include <smmintrin.h>
+#include <emmintrin.h>
#include <xnnpack/gemm.h>
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2(
size_t mr,
size_t nc,
size_t kc,
@@ -48,7 +48,7 @@
size_t k = kc;
while (k >= 8 * sizeof(int8_t)) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+ const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
a0 += 8;
const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -73,7 +73,7 @@
}
if (k != 0) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+ const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
a0 = (const int8_t*) ((uintptr_t) a0 + k);
const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -99,37 +99,22 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
__m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+ vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
__m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
@@ -146,7 +131,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse41.c
similarity index 73%
rename from src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
rename to src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse41.c
index b649211..04b457d 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-sse41.c
@@ -16,7 +16,7 @@
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41(
size_t mr,
size_t nc,
size_t kc,
@@ -99,37 +99,21 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-ssse3.c
similarity index 100%
rename from src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-ssse3.c
rename to src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-ssse3.c
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-xop.c
similarity index 73%
rename from src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
rename to src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-xop.c
index 534ee1b..fcde93f 100644
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/1x4c2-xw-minmax-fp32-xop.c
@@ -21,7 +21,7 @@
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop(
size_t mr,
size_t nc,
size_t kc,
@@ -104,37 +104,21 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
deleted file mode 100644
index 0f03fdf..0000000
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse2.c
+++ /dev/null
@@ -1,172 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c2-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 1);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 2);
- const int8_t* a0 = a;
- int8_t* c0 = c;
-
- do {
- __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = kc;
- while (k >= 8 * sizeof(int8_t)) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
- a0 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k -= 8 * sizeof(int8_t);
- }
- if (k != 0) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
- a0 = (const int8_t*) ((uintptr_t) a0 + k);
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
- if (k > 2 * sizeof(int8_t)) {
- const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
- if (k > 4 * sizeof(int8_t)) {
- const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- }
- }
- }
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
- const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
-
- const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
-
- const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
-
- const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
- const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
-
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
-
- const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
-
- const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
-
- const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
- __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
-
- const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
- const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
- vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
-
- __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
-
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_cvtsi128_si32(vout);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
deleted file mode 100644
index ed5fd68..0000000
--- a/src/qs8-gemm/gen/1x4c2-xw-minmax-gemmlowp-sse41.c
+++ /dev/null
@@ -1,155 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c2-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 1);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 2);
- const int8_t* a0 = a;
- int8_t* c0 = c;
-
- do {
- __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = kc;
- while (k >= 8 * sizeof(int8_t)) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
- a0 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k -= 8 * sizeof(int8_t);
- }
- if (k != 0) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
- a0 = (const int8_t*) ((uintptr_t) a0 + k);
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
- if (k > 2 * sizeof(int8_t)) {
- const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
- if (k > 4 * sizeof(int8_t)) {
- const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- }
- }
- }
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
- __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
-
-
- __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
-
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout, 0);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-avx.c
similarity index 68%
copy from src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
copy to src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-avx.c
index 8f5cae1..7b9e7f3 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-avx.c
@@ -15,7 +15,7 @@
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx(
size_t mr,
size_t nc,
size_t kc,
@@ -75,37 +75,21 @@
__m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse2.c
similarity index 60%
rename from src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
rename to src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse2.c
index a32f77c..593d736 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse2.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse2.c
@@ -15,7 +15,7 @@
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2(
size_t mr,
size_t nc,
size_t kc,
@@ -75,50 +75,18 @@
__m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
- const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
- const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
- const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
-
- const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
- const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
-
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
-
- const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
-
- const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
-
- const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
__m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
- const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
- const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
__m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse41.c
similarity index 68%
rename from src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
rename to src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse41.c
index 8f5cae1..4abd723 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-sse41.c
@@ -15,7 +15,7 @@
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41(
size_t mr,
size_t nc,
size_t kc,
@@ -75,37 +75,21 @@
__m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-ssse3.c
similarity index 62%
copy from src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
copy to src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-ssse3.c
index 8f5cae1..e2d6c95 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-ssse3.c
@@ -9,13 +9,13 @@
#include <assert.h>
-#include <smmintrin.h>
+#include <tmmintrin.h>
#include <xnnpack/gemm.h>
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3(
size_t mr,
size_t nc,
size_t kc,
@@ -50,7 +50,7 @@
size_t k = 0;
while (k < kc) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+ const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
a0 += 8;
const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -75,37 +75,22 @@
__m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
__m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+ vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
__m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
@@ -122,7 +107,7 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout, 0);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-wasmsimd.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-wasmsimd.c
new file mode 100644
index 0000000..d1bda6a
--- /dev/null
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-wasmsimd.c
@@ -0,0 +1,138 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx4c8-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__wasmsimd(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const int8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8);
+ const int8_t* a0 = a;
+ int8_t* c0 = c;
+
+ const v128_t vzero = wasm_f64x2_splat(0.0);
+ do {
+ v128_t vacc0x0 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[0]);
+ v128_t vacc0x1 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[1]);
+ v128_t vacc0x2 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[2]);
+ v128_t vacc0x3 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[3]);
+ w = (const void*) ((const int32_t*) w + 4);
+
+ size_t k = 0;
+ while (k < kc) {
+ const v128_t vxa0 = wasm_i16x8_load8x8(a0);
+ a0 += 8;
+
+ const v128_t vxb0 = wasm_v128_load(w);
+
+ const v128_t vprod0x0 = wasm_i16x8_mul(vxa0, vxb0);
+ vacc0x0 = wasm_i32x4_add(vacc0x0, wasm_i32x4_extend_low_i16x8(vprod0x0));
+ vacc0x0 = wasm_i32x4_add(vacc0x0, wasm_i32x4_extend_high_i16x8(vprod0x0));
+ const v128_t vxb1 = wasm_v128_load((const int16_t*) w + 8);
+
+ const v128_t vprod0x1 = wasm_i16x8_mul(vxa0, vxb1);
+ vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_extend_low_i16x8(vprod0x1));
+ vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_extend_high_i16x8(vprod0x1));
+ const v128_t vxb2 = wasm_v128_load((const int16_t*) w + 16);
+
+ const v128_t vprod0x2 = wasm_i16x8_mul(vxa0, vxb2);
+ vacc0x2 = wasm_i32x4_add(vacc0x2, wasm_i32x4_extend_low_i16x8(vprod0x2));
+ vacc0x2 = wasm_i32x4_add(vacc0x2, wasm_i32x4_extend_high_i16x8(vprod0x2));
+ const v128_t vxb3 = wasm_v128_load((const int16_t*) w + 24);
+
+ const v128_t vprod0x3 = wasm_i16x8_mul(vxa0, vxb3);
+ vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_extend_low_i16x8(vprod0x3));
+ vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_extend_high_i16x8(vprod0x3));
+
+ w = (const void*) ((const int16_t*) w + 32);
+ k += 8 * sizeof(int8_t);
+ }
+
+ const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7));
+ const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7));
+
+ v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7));
+
+ const v128_t vsign0x0123 = wasm_i32x4_shr(vacc0x0123, 31);
+
+ const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5);
+
+ const v128_t vmultiplier = wasm_v128_load(params->gemmlowp_wasmsimd.multiplier);
+ const v128_t vrounding = wasm_v128_load(params->gemmlowp_wasmsimd.rounding);
+ const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding);
+ const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7);
+
+ const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding);
+
+ const v128_t vq31prod0x0123 = wasm_v32x4_shuffle(vprod0x01, vprod0x23, 1, 3, 5, 7);
+
+ const v128_t vremainder_mask = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_mask);
+ const v128_t vrem0x0123 = wasm_i32x4_add(wasm_v128_and(vq31prod0x0123, vremainder_mask), wasm_i32x4_shr(vq31prod0x0123, 31));
+
+ const v128_t vthreshold = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_threshold);
+ const int32_t vshift = params->gemmlowp_wasmsimd.shift;
+ vacc0x0123 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod0x0123, vshift), wasm_i32x4_gt(vrem0x0123, vthreshold));
+
+ const v128_t voutput_zero_point = wasm_v128_load(params->gemmlowp_wasmsimd.output_zero_point);
+ v128_t vacc00x0123 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0x0123, vacc0x0123), voutput_zero_point);
+
+ v128_t vout = wasm_i8x16_narrow_i16x8(vacc00x0123, vacc00x0123);
+
+ const v128_t voutput_min = wasm_v128_load(params->gemmlowp_wasmsimd.output_min);
+ vout = wasm_i8x16_max(vout, voutput_min);
+
+ const v128_t voutput_max = wasm_v128_load(params->gemmlowp_wasmsimd.output_max);
+ vout = wasm_i8x16_min(vout, voutput_max);
+
+ if (nc >= 4) {
+ *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+
+ nc -= 4;
+ } else {
+ if (nc & 2) {
+ *((uint16_t*) c0) = (uint16_t) wasm_i16x8_extract_lane(vout, 0);
+ c0 += 2;
+ vout = wasm_u32x4_shr(vout, 16);
+ }
+ if (nc & 1) {
+ *c0 = (int8_t) wasm_i8x16_extract_lane(vout, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-xop.c
similarity index 68%
rename from src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
rename to src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-xop.c
index c85e7af..5bce735 100644
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/1x4c8-xw-minmax-fp32-xop.c
@@ -20,7 +20,7 @@
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop(
size_t mr,
size_t nc,
size_t kc,
@@ -80,37 +80,21 @@
__m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
deleted file mode 100644
index 27ac294..0000000
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-avx.c
+++ /dev/null
@@ -1,131 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c8-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 1);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 8);
- const int8_t* a0 = a;
- int8_t* c0 = c;
-
- do {
- __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
- __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
- __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
- __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = 0;
- while (k < kc) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
- a0 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k += 8 * sizeof(int8_t);
- }
-
- const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
- const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
-
- __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
-
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
- __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
-
-
- __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
-
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout, 0);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
deleted file mode 100644
index 0be1783..0000000
--- a/src/qs8-gemm/gen/1x4c8-xw-minmax-gemmlowp-ssse3.c
+++ /dev/null
@@ -1,148 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c8-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <tmmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 1);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 8);
- const int8_t* a0 = a;
- int8_t* c0 = c;
-
- do {
- __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
- __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
- __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
- __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = 0;
- while (k < kc) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
- a0 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k += 8 * sizeof(int8_t);
- }
-
- const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
- const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
-
- __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
- const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
-
- const __m128i vabsacc0x0123 = _mm_abs_epi32(vacc0x0123);
-
- const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
-
- const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
- const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
-
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
-
- const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
-
- const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
-
- const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
- __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
-
- const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
- const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
- vacc00x0123 = _mm_min_epi16(_mm_max_epi16(vacc00x0123, voutput_min), voutput_max);
-
- __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
-
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_cvtsi128_si32(vout);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c b/src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c
deleted file mode 100644
index 8b989f5..0000000
--- a/src/qs8-gemm/gen/1x8c8-xw-minmax-gemmlowp-avx2.c
+++ /dev/null
@@ -1,158 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx8c8-avx2.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <immintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/intrinsics-polyfill.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 1);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 8);
- const int8_t* a0 = a;
- int8_t* c0 = c;
-
- do {
- const __m128i vbias0x0 = _mm_loadu_si32(w);
- const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
- __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
- const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
- const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
- __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
- const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
- const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
- __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
- const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
- const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
- __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
- w = (const void*) ((const int32_t*) w + 8);
-
- size_t k = 0;
- while (k < kc) {
- const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
- const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
- a0 += 8;
-
- const __m256i vxb01 = _mm256_load_si256((const __m256i*) w);
-
- vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
- const __m256i vxb23 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 16));
-
- vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
- const __m256i vxb45 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 32));
-
- vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
- const __m256i vxb67 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 48));
-
- vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
-
- w = (const void*) ((const int16_t*) w + 64);
- k += 8 * sizeof(int8_t);
- }
-
- const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
- const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
-
- const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
-
- const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
- __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
-
- const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.rounding);
-
- const __m256i vacc0x11335577 = _mm256_srli_epi64(vacc0x01234567, 32);
-
- const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
-
- const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x11335577, vmultiplier), vrounding);
-
- const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
- const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
-
- const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
-
- const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.remainder_mask);
- const __m256i vrem0x01234567 =
- _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
-
- const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_avx2.shift);
- vacc0x01234567 =
- _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
-
- const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_zero_point);
- __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
-
- vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
-
- __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
-
- vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_min));
- vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_max));
-
- __m128i vout_lo = _mm256_castsi256_si128(vout);
- __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
-
- if (nc >= 8) {
- _mm_storel_epi64((__m128i*) c0, vout_lo);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
-
- nc -= 8;
- } else {
- if (nc & 4) {
- _mm_storeu_si32(c0, vout_lo);
-
- c0 += 4;
-
- vout_lo = _mm_srli_epi64(vout_lo, 32);
- vout_hi = _mm_srli_epi64(vout_hi, 32);
- }
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
-
- c0 += 2;
-
- vout_lo = _mm_srli_epi32(vout_lo, 16);
- vout_hi = _mm_srli_epi32(vout_hi, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-avx.c
similarity index 71%
copy from src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
copy to src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-avx.c
index 3520996..989b12e 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-avx.c
@@ -16,7 +16,7 @@
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx(
size_t mr,
size_t nc,
size_t kc,
@@ -126,47 +126,24 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse2.c
similarity index 65%
copy from src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
copy to src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse2.c
index 3520996..bcc6878 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse2.c
@@ -9,14 +9,14 @@
#include <assert.h>
-#include <smmintrin.h>
+#include <emmintrin.h>
#include <xnnpack/gemm.h>
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2(
size_t mr,
size_t nc,
size_t kc,
@@ -55,10 +55,10 @@
size_t k = kc;
while (k >= 8 * sizeof(int8_t)) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+ const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
a0 += 8;
const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+ const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
a1 += 8;
const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -91,10 +91,10 @@
}
if (k != 0) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+ const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
a0 = (const int8_t*) ((uintptr_t) a0 + k);
const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+ const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
a1 = (const int8_t*) ((uintptr_t) a1 + k);
const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -126,51 +126,30 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+ vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
@@ -188,8 +167,8 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout, 0);
- *c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse41.c
similarity index 71%
rename from src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
rename to src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse41.c
index 3520996..41ea918 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-sse41.c
@@ -16,7 +16,7 @@
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41(
size_t mr,
size_t nc,
size_t kc,
@@ -126,47 +126,24 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-ssse3.c
similarity index 100%
rename from src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-ssse3.c
rename to src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-ssse3.c
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-xop.c
similarity index 71%
rename from src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
rename to src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-xop.c
index 6359db1..338a372 100644
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/2x4c2-xw-minmax-fp32-xop.c
@@ -21,7 +21,7 @@
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__xop(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop(
size_t mr,
size_t nc,
size_t kc,
@@ -131,47 +131,24 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
deleted file mode 100644
index 80f4f6e..0000000
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-avx.c
+++ /dev/null
@@ -1,198 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c2-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__avx(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 2);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 2);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr != 2) {
- a1 = a0;
- c1 = c0;
- }
-
- do {
- __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
- __m128i vacc1x0123 = vacc0x0123;
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = kc;
- while (k >= 8 * sizeof(int8_t)) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
- a0 += 8;
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
- a1 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k -= 8 * sizeof(int8_t);
- }
- if (k != 0) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
- a0 = (const int8_t*) ((uintptr_t) a0 + k);
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
- a1 = (const int8_t*) ((uintptr_t) a1 + k);
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
- if (k > 2 * sizeof(int8_t)) {
- const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
- if (k > 4 * sizeof(int8_t)) {
- const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- }
- }
- }
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
- __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-
-
- __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
-
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
- c1 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout, 0);
- *c1 = (int8_t) _mm_extract_epi8(vout, 4);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
deleted file mode 100644
index 3ca4e3f..0000000
--- a/src/qs8-gemm/gen/2x4c2-xw-minmax-gemmlowp-sse2.c
+++ /dev/null
@@ -1,224 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c2-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse2(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 2);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 2);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr != 2) {
- a1 = a0;
- c1 = c0;
- }
-
- do {
- __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
- __m128i vacc1x0123 = vacc0x0123;
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = kc;
- while (k >= 8 * sizeof(int8_t)) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
- a0 += 8;
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
- a1 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k -= 8 * sizeof(int8_t);
- }
- if (k != 0) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
- a0 = (const int8_t*) ((uintptr_t) a0 + k);
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
- a1 = (const int8_t*) ((uintptr_t) a1 + k);
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
- if (k > 2 * sizeof(int8_t)) {
- const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
- if (k > 4 * sizeof(int8_t)) {
- const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- }
- }
- }
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
- const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
- const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
-
- const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
- const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
-
- const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
- const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
-
- const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
- const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
- const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
- const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
-
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
- const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
-
- const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
- const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
-
- const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
- const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
-
- const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
- __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-
- const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
- const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
- vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-
- __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
-
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- vout = _mm_srli_si128(vout, 4);
- *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
- c1 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_cvtsi128_si32(vout);
- *c1 = (int8_t) _mm_extract_epi16(vout, 2);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-avx.c
similarity index 65%
copy from src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
copy to src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-avx.c
index 573ab1c..6c6da41 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-avx.c
@@ -15,7 +15,7 @@
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx(
size_t mr,
size_t nc,
size_t kc,
@@ -95,47 +95,24 @@
__m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
__m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse2.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse2.c
new file mode 100644
index 0000000..97b1c93
--- /dev/null
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse2.c
@@ -0,0 +1,146 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx4c8-sse.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const int8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 2);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8);
+ const int8_t* a0 = a;
+ int8_t* c0 = c;
+ const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+
+ do {
+ __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+ __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+ __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+ __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+ __m128i vacc1x0 = vacc0x0;
+ __m128i vacc1x1 = vacc0x1;
+ __m128i vacc1x2 = vacc0x2;
+ __m128i vacc1x3 = vacc0x3;
+ w = (const void*) ((const int32_t*) w + 4);
+
+ size_t k = 0;
+ while (k < kc) {
+ const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+ const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
+ a0 += 8;
+ const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+ const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
+ a1 += 8;
+
+ const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
+
+ vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+ vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+ const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
+
+ vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+ vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+ const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
+
+ vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+ vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+ const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
+
+ vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+ vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+ w = (const void*) ((const int16_t*) w + 32);
+ k += 8 * sizeof(int8_t);
+ }
+
+ const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+ const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+ const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
+ const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
+
+ __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+ __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
+
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+ vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+
+ __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+
+ if (nc >= 4) {
+ *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+ a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+ a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+ nc -= 4;
+ } else {
+ if (nc & 2) {
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+ c0 += 2;
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+ c1 += 2;
+ vout = _mm_srli_epi32(vout, 16);
+ }
+ if (nc & 1) {
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse41.c
similarity index 65%
rename from src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
rename to src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse41.c
index 573ab1c..78ca719 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-sse41.c
@@ -15,7 +15,7 @@
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41(
size_t mr,
size_t nc,
size_t kc,
@@ -95,47 +95,24 @@
__m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
__m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-ssse3.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-ssse3.c
new file mode 100644
index 0000000..1287f3d
--- /dev/null
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-ssse3.c
@@ -0,0 +1,146 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx4c8-sse.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <tmmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const int8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 2);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8);
+ const int8_t* a0 = a;
+ int8_t* c0 = c;
+ const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+
+ do {
+ __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+ __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+ __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+ __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+ __m128i vacc1x0 = vacc0x0;
+ __m128i vacc1x1 = vacc0x1;
+ __m128i vacc1x2 = vacc0x2;
+ __m128i vacc1x3 = vacc0x3;
+ w = (const void*) ((const int32_t*) w + 4);
+
+ size_t k = 0;
+ while (k < kc) {
+ const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+ const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
+ a0 += 8;
+ const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+ const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
+ a1 += 8;
+
+ const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
+
+ vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+ vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+ const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
+
+ vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+ vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+ const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
+
+ vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+ vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+ const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
+
+ vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+ vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+
+ w = (const void*) ((const int16_t*) w + 32);
+ k += 8 * sizeof(int8_t);
+ }
+
+ const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+ const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+ const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+ const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+
+ __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+ __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+ vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+
+ __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+
+
+ if (nc >= 4) {
+ *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+ a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+ a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+ nc -= 4;
+ } else {
+ if (nc & 2) {
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+ c0 += 2;
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+ c1 += 2;
+ vout = _mm_srli_epi32(vout, 16);
+ }
+ if (nc & 1) {
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-wasmsimd.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-wasmsimd.c
new file mode 100644
index 0000000..cada9f0
--- /dev/null
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-wasmsimd.c
@@ -0,0 +1,179 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx4c8-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__wasmsimd(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const int8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 2);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8);
+ const int8_t* a0 = a;
+ int8_t* c0 = c;
+ const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+
+ const v128_t vzero = wasm_f64x2_splat(0.0);
+ do {
+ v128_t vacc0x0 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[0]);
+ v128_t vacc0x1 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[1]);
+ v128_t vacc0x2 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[2]);
+ v128_t vacc0x3 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[3]);
+ v128_t vacc1x0 = vacc0x0;
+ v128_t vacc1x1 = vacc0x1;
+ v128_t vacc1x2 = vacc0x2;
+ v128_t vacc1x3 = vacc0x3;
+ w = (const void*) ((const int32_t*) w + 4);
+
+ size_t k = 0;
+ while (k < kc) {
+ const v128_t vxa0 = wasm_i16x8_load8x8(a0);
+ a0 += 8;
+ const v128_t vxa1 = wasm_i16x8_load8x8(a1);
+ a1 += 8;
+
+ const v128_t vxb0 = wasm_v128_load(w);
+
+ const v128_t vprod0x0 = wasm_i16x8_mul(vxa0, vxb0);
+ vacc0x0 = wasm_i32x4_add(vacc0x0, wasm_i32x4_extend_low_i16x8(vprod0x0));
+ vacc0x0 = wasm_i32x4_add(vacc0x0, wasm_i32x4_extend_high_i16x8(vprod0x0));
+ const v128_t vprod1x0 = wasm_i16x8_mul(vxa1, vxb0);
+ vacc1x0 = wasm_i32x4_add(vacc1x0, wasm_i32x4_extend_low_i16x8(vprod1x0));
+ vacc1x0 = wasm_i32x4_add(vacc1x0, wasm_i32x4_extend_high_i16x8(vprod1x0));
+ const v128_t vxb1 = wasm_v128_load((const int16_t*) w + 8);
+
+ const v128_t vprod0x1 = wasm_i16x8_mul(vxa0, vxb1);
+ vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_extend_low_i16x8(vprod0x1));
+ vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_extend_high_i16x8(vprod0x1));
+ const v128_t vprod1x1 = wasm_i16x8_mul(vxa1, vxb1);
+ vacc1x1 = wasm_i32x4_add(vacc1x1, wasm_i32x4_extend_low_i16x8(vprod1x1));
+ vacc1x1 = wasm_i32x4_add(vacc1x1, wasm_i32x4_extend_high_i16x8(vprod1x1));
+ const v128_t vxb2 = wasm_v128_load((const int16_t*) w + 16);
+
+ const v128_t vprod0x2 = wasm_i16x8_mul(vxa0, vxb2);
+ vacc0x2 = wasm_i32x4_add(vacc0x2, wasm_i32x4_extend_low_i16x8(vprod0x2));
+ vacc0x2 = wasm_i32x4_add(vacc0x2, wasm_i32x4_extend_high_i16x8(vprod0x2));
+ const v128_t vprod1x2 = wasm_i16x8_mul(vxa1, vxb2);
+ vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_extend_low_i16x8(vprod1x2));
+ vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_extend_high_i16x8(vprod1x2));
+ const v128_t vxb3 = wasm_v128_load((const int16_t*) w + 24);
+
+ const v128_t vprod0x3 = wasm_i16x8_mul(vxa0, vxb3);
+ vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_extend_low_i16x8(vprod0x3));
+ vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_extend_high_i16x8(vprod0x3));
+ const v128_t vprod1x3 = wasm_i16x8_mul(vxa1, vxb3);
+ vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_extend_low_i16x8(vprod1x3));
+ vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_extend_high_i16x8(vprod1x3));
+
+ w = (const void*) ((const int16_t*) w + 32);
+ k += 8 * sizeof(int8_t);
+ }
+
+ const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7));
+ const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7));
+ const v128_t vacc1x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x0, vacc1x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0, vacc1x2, 2, 6, 3, 7));
+ const v128_t vacc1x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x1, vacc1x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x1, vacc1x3, 2, 6, 3, 7));
+
+ v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7));
+ v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7));
+
+ const v128_t vsign0x0123 = wasm_i32x4_shr(vacc0x0123, 31);
+ const v128_t vsign1x0123 = wasm_i32x4_shr(vacc1x0123, 31);
+
+ const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5);
+ const v128_t vacc1x01 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 0, 4, 1, 5);
+
+ const v128_t vmultiplier = wasm_v128_load(params->gemmlowp_wasmsimd.multiplier);
+ const v128_t vrounding = wasm_v128_load(params->gemmlowp_wasmsimd.rounding);
+ const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding);
+ const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7);
+ const v128_t vprod1x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x01, vmultiplier), vrounding);
+ const v128_t vacc1x23 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 2, 6, 3, 7);
+
+ const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding);
+ const v128_t vprod1x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x23, vmultiplier), vrounding);
+
+ const v128_t vq31prod0x0123 = wasm_v32x4_shuffle(vprod0x01, vprod0x23, 1, 3, 5, 7);
+ const v128_t vq31prod1x0123 = wasm_v32x4_shuffle(vprod1x01, vprod1x23, 1, 3, 5, 7);
+
+ const v128_t vremainder_mask = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_mask);
+ const v128_t vrem0x0123 = wasm_i32x4_add(wasm_v128_and(vq31prod0x0123, vremainder_mask), wasm_i32x4_shr(vq31prod0x0123, 31));
+ const v128_t vrem1x0123 = wasm_i32x4_add(wasm_v128_and(vq31prod1x0123, vremainder_mask), wasm_i32x4_shr(vq31prod1x0123, 31));
+
+ const v128_t vthreshold = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_threshold);
+ const int32_t vshift = params->gemmlowp_wasmsimd.shift;
+ vacc0x0123 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod0x0123, vshift), wasm_i32x4_gt(vrem0x0123, vthreshold));
+ vacc1x0123 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod1x0123, vshift), wasm_i32x4_gt(vrem1x0123, vthreshold));
+
+ const v128_t voutput_zero_point = wasm_v128_load(params->gemmlowp_wasmsimd.output_zero_point);
+ v128_t vacc01x0123 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0x0123, vacc1x0123), voutput_zero_point);
+
+ v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc01x0123);
+
+ const v128_t voutput_min = wasm_v128_load(params->gemmlowp_wasmsimd.output_min);
+ vout = wasm_i8x16_max(vout, voutput_min);
+
+ const v128_t voutput_max = wasm_v128_load(params->gemmlowp_wasmsimd.output_max);
+ vout = wasm_i8x16_min(vout, voutput_max);
+
+ if (nc >= 4) {
+ *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
+ *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+ a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+ a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+
+ nc -= 4;
+ } else {
+ if (nc & 2) {
+ *((uint16_t*) c0) = (uint16_t) wasm_i16x8_extract_lane(vout, 0);
+ c0 += 2;
+ *((uint16_t*) c1) = (uint16_t) wasm_i16x8_extract_lane(vout, 2);
+ c1 += 2;
+ vout = wasm_u32x4_shr(vout, 16);
+ }
+ if (nc & 1) {
+ *c0 = (int8_t) wasm_i8x16_extract_lane(vout, 0);
+ *c1 = (int8_t) wasm_i8x16_extract_lane(vout, 4);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-xop.c
similarity index 65%
rename from src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
rename to src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-xop.c
index 54a91c4..3221462 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/2x4c8-xw-minmax-fp32-xop.c
@@ -20,7 +20,7 @@
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop(
size_t mr,
size_t nc,
size_t kc,
@@ -100,47 +100,24 @@
__m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
__m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
deleted file mode 100644
index 31f4048..0000000
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse2.c
+++ /dev/null
@@ -1,193 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c8-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 2);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 8);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr != 2) {
- a1 = a0;
- c1 = c0;
- }
-
- do {
- __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
- __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
- __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
- __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
- __m128i vacc1x0 = vacc0x0;
- __m128i vacc1x1 = vacc0x1;
- __m128i vacc1x2 = vacc0x2;
- __m128i vacc1x3 = vacc0x3;
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = 0;
- while (k < kc) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
- a0 += 8;
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
- a1 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
- vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
- vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
- vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
- vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k += 8 * sizeof(int8_t);
- }
-
- const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
- const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
- const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
- const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
-
- __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
- __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
- const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
- const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
-
- const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
- const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
-
- const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
- const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
-
- const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
- const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
- const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
- const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
-
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
- const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
-
- const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
- const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
-
- const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
- const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
-
- const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
- __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-
- const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
- const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
- vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-
- __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
-
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- vout = _mm_srli_si128(vout, 4);
- *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
- c1 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_cvtsi128_si32(vout);
- *c1 = (int8_t) _mm_extract_epi16(vout, 2);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
deleted file mode 100644
index a448b3f..0000000
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-sse41.c
+++ /dev/null
@@ -1,167 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c8-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 2);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 8);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr != 2) {
- a1 = a0;
- c1 = c0;
- }
-
- do {
- __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
- __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
- __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
- __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
- __m128i vacc1x0 = vacc0x0;
- __m128i vacc1x1 = vacc0x1;
- __m128i vacc1x2 = vacc0x2;
- __m128i vacc1x3 = vacc0x3;
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = 0;
- while (k < kc) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
- a0 += 8;
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
- a1 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
- vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
- vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
- vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
- vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k += 8 * sizeof(int8_t);
- }
-
- const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
- const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
- const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
- const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
-
- __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
- __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
-
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
- __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-
-
- __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
-
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
- c1 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout, 0);
- *c1 = (int8_t) _mm_extract_epi8(vout, 4);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
deleted file mode 100644
index 9e275fa..0000000
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-ssse3.c
+++ /dev/null
@@ -1,193 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c8-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <tmmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 2);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 8);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr != 2) {
- a1 = a0;
- c1 = c0;
- }
-
- do {
- __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
- __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
- __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
- __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
- __m128i vacc1x0 = vacc0x0;
- __m128i vacc1x1 = vacc0x1;
- __m128i vacc1x2 = vacc0x2;
- __m128i vacc1x3 = vacc0x3;
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = 0;
- while (k < kc) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
- a0 += 8;
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
- a1 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
- vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
- vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
- vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
- vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k += 8 * sizeof(int8_t);
- }
-
- const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
- const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
- const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
- const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
-
- __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
- __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
- const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
- const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
-
- const __m128i vabsacc0x0123 = _mm_abs_epi32(vacc0x0123);
- const __m128i vabsacc1x0123 = _mm_abs_epi32(vacc1x0123);
-
- const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
- const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
-
- const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
- const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
- const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
- const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
-
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
- const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
-
- const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
- const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
-
- const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
- const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
-
- const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
- __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
-
- const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
- const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
- vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
-
- __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
-
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- vout = _mm_srli_si128(vout, 4);
- *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
- c1 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_cvtsi128_si32(vout);
- *c1 = (int8_t) _mm_extract_epi16(vout, 2);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c b/src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c
deleted file mode 100644
index 684ea57..0000000
--- a/src/qs8-gemm/gen/2x8c8-xw-minmax-gemmlowp-avx2.c
+++ /dev/null
@@ -1,197 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx8c8-avx2.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <immintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/intrinsics-polyfill.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 2);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 8);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr != 2) {
- a1 = a0;
- c1 = c0;
- }
-
- do {
- const __m128i vbias0x0 = _mm_loadu_si32(w);
- const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
- __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
- const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
- const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
- __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
- const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
- const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
- __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
- const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
- const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
- __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
- __m256i vacc1x01 = vacc0x01;
- __m256i vacc1x23 = vacc0x23;
- __m256i vacc1x45 = vacc0x45;
- __m256i vacc1x67 = vacc0x67;
- w = (const void*) ((const int32_t*) w + 8);
-
- size_t k = 0;
- while (k < kc) {
- const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
- const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
- a0 += 8;
- const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
- const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
- a1 += 8;
-
- const __m256i vxb01 = _mm256_load_si256((const __m256i*) w);
-
- vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
- vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
- const __m256i vxb23 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 16));
-
- vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
- vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
- const __m256i vxb45 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 32));
-
- vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
- vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
- const __m256i vxb67 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 48));
-
- vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
- vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
-
- w = (const void*) ((const int16_t*) w + 64);
- k += 8 * sizeof(int8_t);
- }
-
- const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
- const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
- const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
- const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
-
- const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
- const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
-
- const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
- __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
- __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
-
- const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.rounding);
-
- const __m256i vacc0x11335577 = _mm256_srli_epi64(vacc0x01234567, 32);
- const __m256i vacc1x11335577 = _mm256_srli_epi64(vacc1x01234567, 32);
-
- const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
- const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
-
- const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x11335577, vmultiplier), vrounding);
- const __m256i vprod1x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x11335577, vmultiplier), vrounding);
-
- const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
- const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
- const __m256i vq31prod1x0246 = _mm256_srli_epi64(vprod1x0246, 31);
- const __m256i vq31prod1x1357 = _mm256_add_epi64(vprod1x1357, vprod1x1357);
-
- const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
- const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
-
- const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.remainder_mask);
- const __m256i vrem0x01234567 =
- _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
- const __m256i vrem1x01234567 =
- _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
-
- const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_avx2.shift);
- vacc0x01234567 =
- _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
- vacc1x01234567 =
- _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
-
- const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_zero_point);
- __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
-
- vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
-
- __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc01x01234567);
-
- vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_min));
- vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_max));
-
- __m128i vout_lo = _mm256_castsi256_si128(vout);
- __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
-
- if (nc >= 8) {
- _mm_storel_epi64((__m128i*) c0, vout_lo);
- _mm_storel_epi64((__m128i*) c1, vout_hi);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
-
- nc -= 8;
- } else {
- if (nc & 4) {
- _mm_storeu_si32(c0, vout_lo);
- _mm_storeu_si32(c1, vout_hi);
-
- c0 += 4;
- c1 += 4;
-
- vout_lo = _mm_srli_epi64(vout_lo, 32);
- vout_hi = _mm_srli_epi64(vout_hi, 32);
- }
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
-
- c0 += 2;
- c1 += 2;
-
- vout_lo = _mm_srli_epi32(vout_lo, 16);
- vout_hi = _mm_srli_epi32(vout_hi, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
- *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-avx.c
similarity index 71%
rename from src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
rename to src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-avx.c
index dfdcc16..a85b5ec 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-avx.c
@@ -16,7 +16,7 @@
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx(
size_t mr,
size_t nc,
size_t kc,
@@ -153,58 +153,28 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+ __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+ vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
- const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+ vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
- const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
- const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
- const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
- const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse2.c
similarity index 64%
copy from src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
copy to src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse2.c
index dfdcc16..b5680dc 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse2.c
@@ -9,14 +9,14 @@
#include <assert.h>
-#include <smmintrin.h>
+#include <emmintrin.h>
#include <xnnpack/gemm.h>
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2(
size_t mr,
size_t nc,
size_t kc,
@@ -62,13 +62,13 @@
size_t k = kc;
while (k >= 8 * sizeof(int8_t)) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+ const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
a0 += 8;
const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+ const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
a1 += 8;
const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+ const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
a2 += 8;
const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -109,13 +109,13 @@
}
if (k != 0) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+ const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
a0 = (const int8_t*) ((uintptr_t) a0 + k);
const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+ const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
a1 = (const int8_t*) ((uintptr_t) a1 + k);
const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+ const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
a2 = (const int8_t*) ((uintptr_t) a2 + k);
const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -153,63 +153,37 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+ __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+ vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
- const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+ vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
- const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
- const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
- const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
- const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+ vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+ vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
- *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
@@ -231,9 +205,9 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout, 0);
- *c1 = (int8_t) _mm_extract_epi8(vout, 4);
- *c2 = (int8_t) _mm_extract_epi8(vout, 8);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse41.c
similarity index 71%
copy from src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
copy to src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse41.c
index dfdcc16..813025a 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-sse41.c
@@ -16,7 +16,7 @@
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41(
size_t mr,
size_t nc,
size_t kc,
@@ -153,58 +153,28 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+ __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+ vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
- const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+ vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
- const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
- const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
- const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
- const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-ssse3.c
similarity index 100%
rename from src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-ssse3.c
rename to src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-ssse3.c
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-xop.c
similarity index 70%
rename from src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
rename to src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-xop.c
index 3312851..55557ae 100644
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/3x4c2-xw-minmax-fp32-xop.c
@@ -21,7 +21,7 @@
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__xop(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop(
size_t mr,
size_t nc,
size_t kc,
@@ -158,58 +158,28 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+ __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+ vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
- const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+ vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
- const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
- const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
- const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
- const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
deleted file mode 100644
index 84e896d..0000000
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-avx.c
+++ /dev/null
@@ -1,242 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c2-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__avx(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 3);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 2);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr < 2) {
- a1 = a0;
- c1 = c0;
- }
- const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
- int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
- if XNN_UNPREDICTABLE(mr <= 2) {
- a2 = a1;
- c2 = c1;
- }
-
- do {
- __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
- __m128i vacc1x0123 = vacc0x0123;
- __m128i vacc2x0123 = vacc0x0123;
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = kc;
- while (k >= 8 * sizeof(int8_t)) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
- a0 += 8;
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
- a1 += 8;
- const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
- a2 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k -= 8 * sizeof(int8_t);
- }
- if (k != 0) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
- a0 = (const int8_t*) ((uintptr_t) a0 + k);
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
- a1 = (const int8_t*) ((uintptr_t) a1 + k);
- const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
- a2 = (const int8_t*) ((uintptr_t) a2 + k);
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
- if (k > 2 * sizeof(int8_t)) {
- const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
- if (k > 4 * sizeof(int8_t)) {
- const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- }
- }
- }
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
- const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
-
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
- const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
- const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
- const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
- const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
- __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
- __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
-
-
- __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
-
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
- *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
- c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
- a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
- c1 += 2;
- *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
- c2 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout, 0);
- *c1 = (int8_t) _mm_extract_epi8(vout, 4);
- *c2 = (int8_t) _mm_extract_epi8(vout, 8);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
deleted file mode 100644
index ec7add0..0000000
--- a/src/qs8-gemm/gen/3x4c2-xw-minmax-gemmlowp-sse2.c
+++ /dev/null
@@ -1,278 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c2-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse2(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 3);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 2);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr < 2) {
- a1 = a0;
- c1 = c0;
- }
- const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
- int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
- if XNN_UNPREDICTABLE(mr <= 2) {
- a2 = a1;
- c2 = c1;
- }
-
- do {
- __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
- __m128i vacc1x0123 = vacc0x0123;
- __m128i vacc2x0123 = vacc0x0123;
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = kc;
- while (k >= 8 * sizeof(int8_t)) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
- a0 += 8;
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
- a1 += 8;
- const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
- a2 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k -= 8 * sizeof(int8_t);
- }
- if (k != 0) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
- a0 = (const int8_t*) ((uintptr_t) a0 + k);
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
- a1 = (const int8_t*) ((uintptr_t) a1 + k);
- const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
- a2 = (const int8_t*) ((uintptr_t) a2 + k);
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
- if (k > 2 * sizeof(int8_t)) {
- const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
- if (k > 4 * sizeof(int8_t)) {
- const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- }
- }
- }
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
- const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
- const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
- const __m128i vnmask2x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc2x0123);
-
- const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
- const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
- const __m128i vabsacc2x0123 = _mm_sub_epi32(_mm_xor_si128(vacc2x0123, vnmask2x0123), vnmask2x0123);
-
- const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vabsacc2x1133 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
- const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
- const __m128i vabsprod2x02 = _mm_mul_epu32(vabsacc2x0123, vmultiplier);
-
- const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask2x02 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
- const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
- const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
- const __m128i vprod2x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x02, vnmask2x02), vnmask2x02);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
- const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
- const __m128i vq31prod2x02 = _mm_srli_epi64(_mm_add_epi64(vprod2x02, vrounding), 31);
-
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
- const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
- const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1133, vmultiplier);
-
- const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask2x13 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
- const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
- const __m128i vprod2x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x13, vnmask2x13), vnmask2x13);
-
- const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
- const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
- const __m128i vq31prod2x13 = _mm_srli_epi64(_mm_add_epi64(vprod2x13, vrounding), 31);
-
- const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod2x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod2x02), _mm_castsi128_ps(vq31prod2x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod2x0123 = _mm_shuffle_epi32(vq31prod2x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse2.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
- __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
- __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
-
- const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
- const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
- vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
- vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
-
- __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
-
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- vout = _mm_srli_si128(vout, 4);
- *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
- vout = _mm_srli_si128(vout, 4);
- *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
- c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
- a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
- c1 += 2;
- *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
- c2 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_cvtsi128_si32(vout);
- *c1 = (int8_t) _mm_extract_epi16(vout, 2);
- *c2 = (int8_t) _mm_extract_epi16(vout, 4);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-avx.c
similarity index 65%
rename from src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
rename to src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-avx.c
index fc8a9ec..bb698ba 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-avx.c
@@ -15,7 +15,7 @@
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx(
size_t mr,
size_t nc,
size_t kc,
@@ -115,58 +115,28 @@
__m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
__m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+ __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+ vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
- const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+ vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
- const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
- const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
- const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
- const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse2.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse2.c
new file mode 100644
index 0000000..7a167e1
--- /dev/null
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse2.c
@@ -0,0 +1,178 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx4c8-sse.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const int8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 3);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8);
+ const int8_t* a0 = a;
+ int8_t* c0 = c;
+ const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+ int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+
+ do {
+ __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+ __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+ __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+ __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+ __m128i vacc1x0 = vacc0x0;
+ __m128i vacc1x1 = vacc0x1;
+ __m128i vacc1x2 = vacc0x2;
+ __m128i vacc1x3 = vacc0x3;
+ __m128i vacc2x0 = vacc0x0;
+ __m128i vacc2x1 = vacc0x1;
+ __m128i vacc2x2 = vacc0x2;
+ __m128i vacc2x3 = vacc0x3;
+ w = (const void*) ((const int32_t*) w + 4);
+
+ size_t k = 0;
+ while (k < kc) {
+ const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+ const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
+ a0 += 8;
+ const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+ const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
+ a1 += 8;
+ const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+ const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
+ a2 += 8;
+
+ const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
+
+ vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+ vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+ vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+ const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
+
+ vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+ vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+ vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+ const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
+
+ vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+ vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+ vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+ const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
+
+ vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+ vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+ vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+ w = (const void*) ((const int16_t*) w + 32);
+ k += 8 * sizeof(int8_t);
+ }
+
+ const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
+ const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
+ const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
+ const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
+ const __m128i vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x2));
+ const __m128i vacc2x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x1, vacc2x3), _mm_unpackhi_epi32(vacc2x1, vacc2x3));
+
+ __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
+ __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
+ __m128i vacc2x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x02, vacc2x13), _mm_unpackhi_epi32(vacc2x02, vacc2x13));
+
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+ __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+ vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
+
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+ vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+ __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+ vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+ vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
+
+ __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+
+ if (nc >= 4) {
+ *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+ a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+ a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+ a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+ nc -= 4;
+ } else {
+ if (nc & 2) {
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+ c0 += 2;
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+ c1 += 2;
+ *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+ c2 += 2;
+ vout = _mm_srli_epi32(vout, 16);
+ }
+ if (nc & 1) {
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse41.c
similarity index 65%
copy from src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
copy to src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse41.c
index fc8a9ec..82ce7ce 100644
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse41.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-sse41.c
@@ -15,7 +15,7 @@
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41(
size_t mr,
size_t nc,
size_t kc,
@@ -115,58 +115,28 @@
__m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
__m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+ __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+ vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
- const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+ vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
- const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
- const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
- const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
- const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-ssse3.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-ssse3.c
new file mode 100644
index 0000000..e0403c7
--- /dev/null
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-ssse3.c
@@ -0,0 +1,178 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx4c8-sse.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <tmmintrin.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const int8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 3);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8);
+ const int8_t* a0 = a;
+ int8_t* c0 = c;
+ const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+ int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+
+ do {
+ __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
+ __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
+ __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
+ __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
+ __m128i vacc1x0 = vacc0x0;
+ __m128i vacc1x1 = vacc0x1;
+ __m128i vacc1x2 = vacc0x2;
+ __m128i vacc1x3 = vacc0x3;
+ __m128i vacc2x0 = vacc0x0;
+ __m128i vacc2x1 = vacc0x1;
+ __m128i vacc2x2 = vacc0x2;
+ __m128i vacc2x3 = vacc0x3;
+ w = (const void*) ((const int32_t*) w + 4);
+
+ size_t k = 0;
+ while (k < kc) {
+ const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
+ const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
+ a0 += 8;
+ const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
+ const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
+ a1 += 8;
+ const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+ const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
+ a2 += 8;
+
+ const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
+
+ vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
+ vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
+ vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
+ const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
+
+ vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
+ vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
+ vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
+ const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
+
+ vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
+ vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
+ vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
+ const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
+
+ vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
+ vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
+ vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
+
+ w = (const void*) ((const int16_t*) w + 32);
+ k += 8 * sizeof(int8_t);
+ }
+
+ const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
+ const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
+ const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
+ const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+ const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+ const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
+
+ __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
+ __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+ __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
+
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+ __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+ vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
+
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+ vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
+ __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+ __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+ vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+ vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
+
+ __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
+
+
+ if (nc >= 4) {
+ *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+ a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+ a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+ a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+ nc -= 4;
+ } else {
+ if (nc & 2) {
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
+ c0 += 2;
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
+ c1 += 2;
+ *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+ c2 += 2;
+ vout = _mm_srli_epi32(vout, 16);
+ }
+ if (nc & 1) {
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-wasmsimd.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-wasmsimd.c
new file mode 100644
index 0000000..f9b0a8e
--- /dev/null
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-wasmsimd.c
@@ -0,0 +1,221 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx4c8-wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__wasmsimd(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const int8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ assert(mr != 0);
+ assert(mr <= 3);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ kc = round_up_po2(kc, 8);
+ const int8_t* a0 = a;
+ int8_t* c0 = c;
+ const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+ int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+
+ const v128_t vzero = wasm_f64x2_splat(0.0);
+ do {
+ v128_t vacc0x0 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[0]);
+ v128_t vacc0x1 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[1]);
+ v128_t vacc0x2 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[2]);
+ v128_t vacc0x3 = wasm_f32x4_replace_lane(vzero, 0, ((const float*) w)[3]);
+ v128_t vacc1x0 = vacc0x0;
+ v128_t vacc1x1 = vacc0x1;
+ v128_t vacc1x2 = vacc0x2;
+ v128_t vacc1x3 = vacc0x3;
+ v128_t vacc2x0 = vacc0x0;
+ v128_t vacc2x1 = vacc0x1;
+ v128_t vacc2x2 = vacc0x2;
+ v128_t vacc2x3 = vacc0x3;
+ w = (const void*) ((const int32_t*) w + 4);
+
+ size_t k = 0;
+ while (k < kc) {
+ const v128_t vxa0 = wasm_i16x8_load8x8(a0);
+ a0 += 8;
+ const v128_t vxa1 = wasm_i16x8_load8x8(a1);
+ a1 += 8;
+ const v128_t vxa2 = wasm_i16x8_load8x8(a2);
+ a2 += 8;
+
+ const v128_t vxb0 = wasm_v128_load(w);
+
+ const v128_t vprod0x0 = wasm_i16x8_mul(vxa0, vxb0);
+ vacc0x0 = wasm_i32x4_add(vacc0x0, wasm_i32x4_extend_low_i16x8(vprod0x0));
+ vacc0x0 = wasm_i32x4_add(vacc0x0, wasm_i32x4_extend_high_i16x8(vprod0x0));
+ const v128_t vprod1x0 = wasm_i16x8_mul(vxa1, vxb0);
+ vacc1x0 = wasm_i32x4_add(vacc1x0, wasm_i32x4_extend_low_i16x8(vprod1x0));
+ vacc1x0 = wasm_i32x4_add(vacc1x0, wasm_i32x4_extend_high_i16x8(vprod1x0));
+ const v128_t vprod2x0 = wasm_i16x8_mul(vxa2, vxb0);
+ vacc2x0 = wasm_i32x4_add(vacc2x0, wasm_i32x4_extend_low_i16x8(vprod2x0));
+ vacc2x0 = wasm_i32x4_add(vacc2x0, wasm_i32x4_extend_high_i16x8(vprod2x0));
+ const v128_t vxb1 = wasm_v128_load((const int16_t*) w + 8);
+
+ const v128_t vprod0x1 = wasm_i16x8_mul(vxa0, vxb1);
+ vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_extend_low_i16x8(vprod0x1));
+ vacc0x1 = wasm_i32x4_add(vacc0x1, wasm_i32x4_extend_high_i16x8(vprod0x1));
+ const v128_t vprod1x1 = wasm_i16x8_mul(vxa1, vxb1);
+ vacc1x1 = wasm_i32x4_add(vacc1x1, wasm_i32x4_extend_low_i16x8(vprod1x1));
+ vacc1x1 = wasm_i32x4_add(vacc1x1, wasm_i32x4_extend_high_i16x8(vprod1x1));
+ const v128_t vprod2x1 = wasm_i16x8_mul(vxa2, vxb1);
+ vacc2x1 = wasm_i32x4_add(vacc2x1, wasm_i32x4_extend_low_i16x8(vprod2x1));
+ vacc2x1 = wasm_i32x4_add(vacc2x1, wasm_i32x4_extend_high_i16x8(vprod2x1));
+ const v128_t vxb2 = wasm_v128_load((const int16_t*) w + 16);
+
+ const v128_t vprod0x2 = wasm_i16x8_mul(vxa0, vxb2);
+ vacc0x2 = wasm_i32x4_add(vacc0x2, wasm_i32x4_extend_low_i16x8(vprod0x2));
+ vacc0x2 = wasm_i32x4_add(vacc0x2, wasm_i32x4_extend_high_i16x8(vprod0x2));
+ const v128_t vprod1x2 = wasm_i16x8_mul(vxa1, vxb2);
+ vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_extend_low_i16x8(vprod1x2));
+ vacc1x2 = wasm_i32x4_add(vacc1x2, wasm_i32x4_extend_high_i16x8(vprod1x2));
+ const v128_t vprod2x2 = wasm_i16x8_mul(vxa2, vxb2);
+ vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_extend_low_i16x8(vprod2x2));
+ vacc2x2 = wasm_i32x4_add(vacc2x2, wasm_i32x4_extend_high_i16x8(vprod2x2));
+ const v128_t vxb3 = wasm_v128_load((const int16_t*) w + 24);
+
+ const v128_t vprod0x3 = wasm_i16x8_mul(vxa0, vxb3);
+ vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_extend_low_i16x8(vprod0x3));
+ vacc0x3 = wasm_i32x4_add(vacc0x3, wasm_i32x4_extend_high_i16x8(vprod0x3));
+ const v128_t vprod1x3 = wasm_i16x8_mul(vxa1, vxb3);
+ vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_extend_low_i16x8(vprod1x3));
+ vacc1x3 = wasm_i32x4_add(vacc1x3, wasm_i32x4_extend_high_i16x8(vprod1x3));
+ const v128_t vprod2x3 = wasm_i16x8_mul(vxa2, vxb3);
+ vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_extend_low_i16x8(vprod2x3));
+ vacc2x3 = wasm_i32x4_add(vacc2x3, wasm_i32x4_extend_high_i16x8(vprod2x3));
+
+ w = (const void*) ((const int16_t*) w + 32);
+ k += 8 * sizeof(int8_t);
+ }
+
+ const v128_t vacc0x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x0, vacc0x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x0, vacc0x2, 2, 6, 3, 7));
+ const v128_t vacc0x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x1, vacc0x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x1, vacc0x3, 2, 6, 3, 7));
+ const v128_t vacc1x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x0, vacc1x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x0, vacc1x2, 2, 6, 3, 7));
+ const v128_t vacc1x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x1, vacc1x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x1, vacc1x3, 2, 6, 3, 7));
+ const v128_t vacc2x02 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x0, vacc2x2, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x0, vacc2x2, 2, 6, 3, 7));
+ const v128_t vacc2x13 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x1, vacc2x3, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x1, vacc2x3, 2, 6, 3, 7));
+
+ v128_t vacc0x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc0x02, vacc0x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc0x02, vacc0x13, 2, 6, 3, 7));
+ v128_t vacc1x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc1x02, vacc1x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc1x02, vacc1x13, 2, 6, 3, 7));
+ v128_t vacc2x0123 = wasm_i32x4_add(wasm_v32x4_shuffle(vacc2x02, vacc2x13, 0, 4, 1, 5), wasm_v32x4_shuffle(vacc2x02, vacc2x13, 2, 6, 3, 7));
+
+ const v128_t vsign0x0123 = wasm_i32x4_shr(vacc0x0123, 31);
+ const v128_t vsign1x0123 = wasm_i32x4_shr(vacc1x0123, 31);
+ const v128_t vsign2x0123 = wasm_i32x4_shr(vacc2x0123, 31);
+
+ const v128_t vacc0x01 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 0, 4, 1, 5);
+ const v128_t vacc1x01 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 0, 4, 1, 5);
+ const v128_t vacc2x01 = wasm_v32x4_shuffle(vacc2x0123, vsign2x0123, 0, 4, 1, 5);
+
+ const v128_t vmultiplier = wasm_v128_load(params->gemmlowp_wasmsimd.multiplier);
+ const v128_t vrounding = wasm_v128_load(params->gemmlowp_wasmsimd.rounding);
+ const v128_t vprod0x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x01, vmultiplier), vrounding);
+ const v128_t vacc0x23 = wasm_v32x4_shuffle(vacc0x0123, vsign0x0123, 2, 6, 3, 7);
+ const v128_t vprod1x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x01, vmultiplier), vrounding);
+ const v128_t vacc1x23 = wasm_v32x4_shuffle(vacc1x0123, vsign1x0123, 2, 6, 3, 7);
+ const v128_t vprod2x01 = wasm_i64x2_add(wasm_i64x2_mul(vacc2x01, vmultiplier), vrounding);
+ const v128_t vacc2x23 = wasm_v32x4_shuffle(vacc2x0123, vsign2x0123, 2, 6, 3, 7);
+
+ const v128_t vprod0x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc0x23, vmultiplier), vrounding);
+ const v128_t vprod1x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc1x23, vmultiplier), vrounding);
+ const v128_t vprod2x23 = wasm_i64x2_add(wasm_i64x2_mul(vacc2x23, vmultiplier), vrounding);
+
+ const v128_t vq31prod0x0123 = wasm_v32x4_shuffle(vprod0x01, vprod0x23, 1, 3, 5, 7);
+ const v128_t vq31prod1x0123 = wasm_v32x4_shuffle(vprod1x01, vprod1x23, 1, 3, 5, 7);
+ const v128_t vq31prod2x0123 = wasm_v32x4_shuffle(vprod2x01, vprod2x23, 1, 3, 5, 7);
+
+ const v128_t vremainder_mask = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_mask);
+ const v128_t vrem0x0123 = wasm_i32x4_add(wasm_v128_and(vq31prod0x0123, vremainder_mask), wasm_i32x4_shr(vq31prod0x0123, 31));
+ const v128_t vrem1x0123 = wasm_i32x4_add(wasm_v128_and(vq31prod1x0123, vremainder_mask), wasm_i32x4_shr(vq31prod1x0123, 31));
+ const v128_t vrem2x0123 = wasm_i32x4_add(wasm_v128_and(vq31prod2x0123, vremainder_mask), wasm_i32x4_shr(vq31prod2x0123, 31));
+
+ const v128_t vthreshold = wasm_v128_load(params->gemmlowp_wasmsimd.remainder_threshold);
+ const int32_t vshift = params->gemmlowp_wasmsimd.shift;
+ vacc0x0123 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod0x0123, vshift), wasm_i32x4_gt(vrem0x0123, vthreshold));
+ vacc1x0123 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod1x0123, vshift), wasm_i32x4_gt(vrem1x0123, vthreshold));
+ vacc2x0123 = wasm_i32x4_sub(wasm_i32x4_shr(vq31prod2x0123, vshift), wasm_i32x4_gt(vrem2x0123, vthreshold));
+
+ const v128_t voutput_zero_point = wasm_v128_load(params->gemmlowp_wasmsimd.output_zero_point);
+ v128_t vacc01x0123 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0x0123, vacc1x0123), voutput_zero_point);
+ v128_t vacc22x0123 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc2x0123, vacc2x0123), voutput_zero_point);
+
+ v128_t vout = wasm_i8x16_narrow_i16x8(vacc01x0123, vacc22x0123);
+
+ const v128_t voutput_min = wasm_v128_load(params->gemmlowp_wasmsimd.output_min);
+ vout = wasm_i8x16_max(vout, voutput_min);
+
+ const v128_t voutput_max = wasm_v128_load(params->gemmlowp_wasmsimd.output_max);
+ vout = wasm_i8x16_min(vout, voutput_max);
+
+ if (nc >= 4) {
+ *((float*) c0) = (float) wasm_f32x4_extract_lane(vout, 0);
+ *((float*) c1) = (float) wasm_f32x4_extract_lane(vout, 1);
+ *((float*) c2) = (float) wasm_f32x4_extract_lane(vout, 2);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+ a0 = (const int8_t*) ((uintptr_t) a0 - kc);
+ a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+ a2 = (const int8_t*) ((uintptr_t) a2 - kc);
+
+ nc -= 4;
+ } else {
+ if (nc & 2) {
+ *((uint16_t*) c0) = (uint16_t) wasm_i16x8_extract_lane(vout, 0);
+ c0 += 2;
+ *((uint16_t*) c1) = (uint16_t) wasm_i16x8_extract_lane(vout, 2);
+ c1 += 2;
+ *((uint16_t*) c2) = (uint16_t) wasm_i16x8_extract_lane(vout, 4);
+ c2 += 2;
+ vout = wasm_u32x4_shr(vout, 16);
+ }
+ if (nc & 1) {
+ *c0 = (int8_t) wasm_i8x16_extract_lane(vout, 0);
+ *c1 = (int8_t) wasm_i8x16_extract_lane(vout, 4);
+ *c2 = (int8_t) wasm_i8x16_extract_lane(vout, 8);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-xop.c
similarity index 64%
copy from src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
copy to src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-xop.c
index 54a91c4..2719fb6 100644
--- a/src/qs8-gemm/gen/2x4c8-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/3x4c8-xw-minmax-fp32-xop.c
@@ -20,7 +20,7 @@
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop(
size_t mr,
size_t nc,
size_t kc,
@@ -33,7 +33,7 @@
const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
assert(mr != 0);
- assert(mr <= 2);
+ assert(mr <= 3);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(int8_t) == 0);
@@ -46,10 +46,16 @@
int8_t* c0 = c;
const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr != 2) {
+ if XNN_UNPREDICTABLE(mr < 2) {
a1 = a0;
c1 = c0;
}
+ const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+ int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
do {
__m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
@@ -60,6 +66,10 @@
__m128i vacc1x1 = vacc0x1;
__m128i vacc1x2 = vacc0x2;
__m128i vacc1x3 = vacc0x3;
+ __m128i vacc2x0 = vacc0x0;
+ __m128i vacc2x1 = vacc0x1;
+ __m128i vacc2x2 = vacc0x2;
+ __m128i vacc2x3 = vacc0x3;
w = (const void*) ((const int32_t*) w + 4);
size_t k = 0;
@@ -70,23 +80,30 @@
const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
a1 += 8;
+ const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
+ const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+ a2 += 8;
const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
+ vacc2x0 = _mm_maddd_epi16(vxa2, vxb0, vacc2x0);
const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
+ vacc2x1 = _mm_maddd_epi16(vxa2, vxb1, vacc2x1);
const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
+ vacc2x2 = _mm_maddd_epi16(vxa2, vxb2, vacc2x2);
const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
+ vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);
w = (const void*) ((const int16_t*) w + 32);
k += 8 * sizeof(int8_t);
@@ -96,61 +113,48 @@
const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
+ const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
+ const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
__m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
__m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
+ __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+ __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+ vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+ vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
+ __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
- __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
+ __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
*((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
+ *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
a0 = (const int8_t*) ((uintptr_t) a0 - kc);
a1 = (const int8_t*) ((uintptr_t) a1 - kc);
+ a2 = (const int8_t*) ((uintptr_t) a2 - kc);
nc -= 4;
} else {
@@ -159,11 +163,14 @@
c0 += 2;
*((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
c1 += 2;
+ *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
+ c2 += 2;
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
*c0 = (int8_t) _mm_extract_epi8(vout, 0);
*c1 = (int8_t) _mm_extract_epi8(vout, 4);
+ *c2 = (int8_t) _mm_extract_epi8(vout, 8);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
deleted file mode 100644
index 0e1fd6b..0000000
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-avx.c
+++ /dev/null
@@ -1,204 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c8-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 3);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 8);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr < 2) {
- a1 = a0;
- c1 = c0;
- }
- const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
- int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
- if XNN_UNPREDICTABLE(mr <= 2) {
- a2 = a1;
- c2 = c1;
- }
-
- do {
- __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
- __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
- __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
- __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
- __m128i vacc1x0 = vacc0x0;
- __m128i vacc1x1 = vacc0x1;
- __m128i vacc1x2 = vacc0x2;
- __m128i vacc1x3 = vacc0x3;
- __m128i vacc2x0 = vacc0x0;
- __m128i vacc2x1 = vacc0x1;
- __m128i vacc2x2 = vacc0x2;
- __m128i vacc2x3 = vacc0x3;
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = 0;
- while (k < kc) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
- a0 += 8;
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
- a1 += 8;
- const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
- a2 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
- vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
- vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
- vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
- vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
- vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
- vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
- vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
- vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k += 8 * sizeof(int8_t);
- }
-
- const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
- const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
- const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
- const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
- const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
- const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
-
- __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
- __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
- __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
- const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
-
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
- const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
- const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
- const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
- const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
- __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
- __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
-
-
- __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
-
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
- *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
- c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
- a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
- c1 += 2;
- *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
- c2 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout, 0);
- *c1 = (int8_t) _mm_extract_epi8(vout, 4);
- *c2 = (int8_t) _mm_extract_epi8(vout, 8);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
deleted file mode 100644
index 2a4c9b1..0000000
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-sse2.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c8-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 3);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 8);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr < 2) {
- a1 = a0;
- c1 = c0;
- }
- const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
- int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
- if XNN_UNPREDICTABLE(mr <= 2) {
- a2 = a1;
- c2 = c1;
- }
-
- do {
- __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
- __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
- __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
- __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
- __m128i vacc1x0 = vacc0x0;
- __m128i vacc1x1 = vacc0x1;
- __m128i vacc1x2 = vacc0x2;
- __m128i vacc1x3 = vacc0x3;
- __m128i vacc2x0 = vacc0x0;
- __m128i vacc2x1 = vacc0x1;
- __m128i vacc2x2 = vacc0x2;
- __m128i vacc2x3 = vacc0x3;
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = 0;
- while (k < kc) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
- a0 += 8;
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
- a1 += 8;
- const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
- a2 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
- vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
- vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
- vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
- vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
- vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
- vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
- vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
- vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k += 8 * sizeof(int8_t);
- }
-
- const __m128i vacc0x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x0, vacc0x2), _mm_unpackhi_epi32(vacc0x0, vacc0x2));
- const __m128i vacc0x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x1, vacc0x3), _mm_unpackhi_epi32(vacc0x1, vacc0x3));
- const __m128i vacc1x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x0, vacc1x2), _mm_unpackhi_epi32(vacc1x0, vacc1x2));
- const __m128i vacc1x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x1, vacc1x3), _mm_unpackhi_epi32(vacc1x1, vacc1x3));
- const __m128i vacc2x02 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x0, vacc2x2), _mm_unpackhi_epi32(vacc2x0, vacc2x2));
- const __m128i vacc2x13 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x1, vacc2x3), _mm_unpackhi_epi32(vacc2x1, vacc2x3));
-
- __m128i vacc0x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc0x02, vacc0x13), _mm_unpackhi_epi32(vacc0x02, vacc0x13));
- __m128i vacc1x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc1x02, vacc1x13), _mm_unpackhi_epi32(vacc1x02, vacc1x13));
- __m128i vacc2x0123 = _mm_add_epi32(_mm_unpacklo_epi32(vacc2x02, vacc2x13), _mm_unpackhi_epi32(vacc2x02, vacc2x13));
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
- const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
- const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
- const __m128i vnmask2x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc2x0123);
-
- const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
- const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
- const __m128i vabsacc2x0123 = _mm_sub_epi32(_mm_xor_si128(vacc2x0123, vnmask2x0123), vnmask2x0123);
-
- const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vabsacc2x1133 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
- const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
- const __m128i vabsprod2x02 = _mm_mul_epu32(vabsacc2x0123, vmultiplier);
-
- const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask2x02 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
- const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
- const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
- const __m128i vprod2x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x02, vnmask2x02), vnmask2x02);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
- const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
- const __m128i vq31prod2x02 = _mm_srli_epi64(_mm_add_epi64(vprod2x02, vrounding), 31);
-
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
- const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
- const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1133, vmultiplier);
-
- const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask2x13 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
- const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
- const __m128i vprod2x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x13, vnmask2x13), vnmask2x13);
-
- const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
- const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
- const __m128i vq31prod2x13 = _mm_srli_epi64(_mm_add_epi64(vprod2x13, vrounding), 31);
-
- const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod2x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod2x02), _mm_castsi128_ps(vq31prod2x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod2x0123 = _mm_shuffle_epi32(vq31prod2x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse2.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
- __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
- __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
-
- const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
- const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
- vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
- vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
-
- __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
-
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- vout = _mm_srli_si128(vout, 4);
- *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
- vout = _mm_srli_si128(vout, 4);
- *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
- c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
- a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
- c1 += 2;
- *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
- c2 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_cvtsi128_si32(vout);
- *c1 = (int8_t) _mm_extract_epi16(vout, 2);
- *c2 = (int8_t) _mm_extract_epi16(vout, 4);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
deleted file mode 100644
index a7d781b..0000000
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-ssse3.c
+++ /dev/null
@@ -1,240 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c8-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <tmmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 3);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 8);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr < 2) {
- a1 = a0;
- c1 = c0;
- }
- const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
- int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
- if XNN_UNPREDICTABLE(mr <= 2) {
- a2 = a1;
- c2 = c1;
- }
-
- do {
- __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
- __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
- __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
- __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
- __m128i vacc1x0 = vacc0x0;
- __m128i vacc1x1 = vacc0x1;
- __m128i vacc1x2 = vacc0x2;
- __m128i vacc1x3 = vacc0x3;
- __m128i vacc2x0 = vacc0x0;
- __m128i vacc2x1 = vacc0x1;
- __m128i vacc2x2 = vacc0x2;
- __m128i vacc2x3 = vacc0x3;
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = 0;
- while (k < kc) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
- a0 += 8;
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
- a1 += 8;
- const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
- a2 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
- vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
- vacc2x0 = _mm_add_epi32(vacc2x0, _mm_madd_epi16(vxa2, vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
- vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
- vacc2x1 = _mm_add_epi32(vacc2x1, _mm_madd_epi16(vxa2, vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
- vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
- vacc2x2 = _mm_add_epi32(vacc2x2, _mm_madd_epi16(vxa2, vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
- vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
- vacc2x3 = _mm_add_epi32(vacc2x3, _mm_madd_epi16(vxa2, vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k += 8 * sizeof(int8_t);
- }
-
- const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
- const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
- const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
- const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
- const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
- const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
-
- __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
- __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
- __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
- const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
- const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
- const __m128i vnmask2x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc2x0123);
-
- const __m128i vabsacc0x0123 = _mm_abs_epi32(vacc0x0123);
- const __m128i vabsacc1x0123 = _mm_abs_epi32(vacc1x0123);
- const __m128i vabsacc2x0123 = _mm_abs_epi32(vacc2x0123);
-
- const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vabsacc2x1133 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
- const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
- const __m128i vabsprod2x02 = _mm_mul_epu32(vabsacc2x0123, vmultiplier);
-
- const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask2x02 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
- const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
- const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
- const __m128i vprod2x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x02, vnmask2x02), vnmask2x02);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
- const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
- const __m128i vq31prod2x02 = _mm_srli_epi64(_mm_add_epi64(vprod2x02, vrounding), 31);
-
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
- const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
- const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1133, vmultiplier);
-
- const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask2x13 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
- const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
- const __m128i vprod2x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x13, vnmask2x13), vnmask2x13);
-
- const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
- const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
- const __m128i vq31prod2x13 = _mm_srli_epi64(_mm_add_epi64(vprod2x13, vrounding), 31);
-
- const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod2x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod2x02), _mm_castsi128_ps(vq31prod2x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod2x0123 = _mm_shuffle_epi32(vq31prod2x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse2.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
- __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
- __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
-
- const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
- const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
- vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
- vacc22x0123 = _mm_min_epi16(_mm_max_epi16(vacc22x0123, voutput_min), voutput_max);
-
- __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
-
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- vout = _mm_srli_si128(vout, 4);
- *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
- vout = _mm_srli_si128(vout, 4);
- *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
- c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
- a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
- c1 += 2;
- *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
- c2 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_cvtsi128_si32(vout);
- *c1 = (int8_t) _mm_extract_epi16(vout, 2);
- *c2 = (int8_t) _mm_extract_epi16(vout, 4);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
deleted file mode 100644
index a54da33..0000000
--- a/src/qs8-gemm/gen/3x4c8-xw-minmax-gemmlowp-xop.c
+++ /dev/null
@@ -1,209 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c8-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#if defined(__GNUC__) || defined(__clang__)
- #include <x86intrin.h>
-#else
- #include <immintrin.h>
- #include <ammintrin.h>
-#endif
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 3);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 8);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr < 2) {
- a1 = a0;
- c1 = c0;
- }
- const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
- int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
- if XNN_UNPREDICTABLE(mr <= 2) {
- a2 = a1;
- c2 = c1;
- }
-
- do {
- __m128i vacc0x0 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[0]);
- __m128i vacc0x1 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[1]);
- __m128i vacc0x2 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[2]);
- __m128i vacc0x3 = _mm_cvtsi32_si128((int) ((const int32_t*) w)[3]);
- __m128i vacc1x0 = vacc0x0;
- __m128i vacc1x1 = vacc0x1;
- __m128i vacc1x2 = vacc0x2;
- __m128i vacc1x3 = vacc0x3;
- __m128i vacc2x0 = vacc0x0;
- __m128i vacc2x1 = vacc0x1;
- __m128i vacc2x2 = vacc0x2;
- __m128i vacc2x3 = vacc0x3;
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = 0;
- while (k < kc) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
- a0 += 8;
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
- a1 += 8;
- const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
- a2 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
- vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
- vacc2x0 = _mm_maddd_epi16(vxa2, vxb0, vacc2x0);
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
- vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
- vacc2x1 = _mm_maddd_epi16(vxa2, vxb1, vacc2x1);
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
- vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
- vacc2x2 = _mm_maddd_epi16(vxa2, vxb2, vacc2x2);
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
- vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
- vacc2x3 = _mm_maddd_epi16(vxa2, vxb3, vacc2x3);
-
- w = (const void*) ((const int16_t*) w + 32);
- k += 8 * sizeof(int8_t);
- }
-
- const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
- const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
- const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
- const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
- const __m128i vacc2x01 = _mm_hadd_epi32(vacc2x0, vacc2x1);
- const __m128i vacc2x23 = _mm_hadd_epi32(vacc2x2, vacc2x3);
-
- __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
- __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
- __m128i vacc2x0123 = _mm_hadd_epi32(vacc2x01, vacc2x23);
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
- const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
-
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
- const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
- const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
- const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
- const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
- __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
- __m128i vacc22x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc2x0123), voutput_zero_point);
-
-
- __m128i vout = _mm_packs_epi16(vacc01x0123, vacc22x0123);
-
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
- *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
- c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
- a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
- c1 += 2;
- *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
- c2 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout, 0);
- *c1 = (int8_t) _mm_extract_epi8(vout, 4);
- *c2 = (int8_t) _mm_extract_epi8(vout, 8);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c b/src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c
deleted file mode 100644
index 019873f..0000000
--- a/src/qs8-gemm/gen/3x8c8-xw-minmax-gemmlowp-avx2.c
+++ /dev/null
@@ -1,238 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx8c8-avx2.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <immintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/intrinsics-polyfill.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 3);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 8);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr < 2) {
- a1 = a0;
- c1 = c0;
- }
- const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
- int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
- if XNN_UNPREDICTABLE(mr <= 2) {
- a2 = a1;
- c2 = c1;
- }
-
- do {
- const __m128i vbias0x0 = _mm_loadu_si32(w);
- const __m128i vbias0x1 = _mm_loadu_si32((const int32_t*) w + 1);
- __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
- const __m128i vbias0x2 = _mm_loadu_si32((const int32_t*) w + 2);
- const __m128i vbias0x3 = _mm_loadu_si32((const int32_t*) w + 3);
- __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
- const __m128i vbias0x4 = _mm_loadu_si32((const int32_t*) w + 4);
- const __m128i vbias0x5 = _mm_loadu_si32((const int32_t*) w + 5);
- __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
- const __m128i vbias0x6 = _mm_loadu_si32((const int32_t*) w + 6);
- const __m128i vbias0x7 = _mm_loadu_si32((const int32_t*) w + 7);
- __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
- __m256i vacc1x01 = vacc0x01;
- __m256i vacc1x23 = vacc0x23;
- __m256i vacc1x45 = vacc0x45;
- __m256i vacc1x67 = vacc0x67;
- __m256i vacc2x01 = vacc0x01;
- __m256i vacc2x23 = vacc0x23;
- __m256i vacc2x45 = vacc0x45;
- __m256i vacc2x67 = vacc0x67;
- w = (const void*) ((const int32_t*) w + 8);
-
- size_t k = 0;
- while (k < kc) {
- const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
- const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
- a0 += 8;
- const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
- const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
- a1 += 8;
- const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
- const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
- a2 += 8;
-
- const __m256i vxb01 = _mm256_load_si256((const __m256i*) w);
-
- vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
- vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
- vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
- const __m256i vxb23 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 16));
-
- vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
- vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
- vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
- const __m256i vxb45 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 32));
-
- vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
- vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
- vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
- const __m256i vxb67 = _mm256_load_si256((const __m256i*) ((const int16_t*) w + 48));
-
- vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
- vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
- vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
-
- w = (const void*) ((const int16_t*) w + 64);
- k += 8 * sizeof(int8_t);
- }
-
- const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
- const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
- const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
- const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
- const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
- const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
-
- const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
- const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
- const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
-
- const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
- __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
- __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
- __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
-
- const __m256i vmultiplier = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.multiplier);
- const __m256i vrounding = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.rounding);
-
- const __m256i vacc0x11335577 = _mm256_srli_epi64(vacc0x01234567, 32);
- const __m256i vacc1x11335577 = _mm256_srli_epi64(vacc1x01234567, 32);
- const __m256i vacc2x11335577 = _mm256_srli_epi64(vacc2x01234567, 32);
-
- const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
- const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
- const __m256i vprod2x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc2x01234567, vmultiplier), vrounding);
-
- const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x11335577, vmultiplier), vrounding);
- const __m256i vprod1x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x11335577, vmultiplier), vrounding);
- const __m256i vprod2x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc2x11335577, vmultiplier), vrounding);
-
- const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
- const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
- const __m256i vq31prod1x0246 = _mm256_srli_epi64(vprod1x0246, 31);
- const __m256i vq31prod1x1357 = _mm256_add_epi64(vprod1x1357, vprod1x1357);
- const __m256i vq31prod2x0246 = _mm256_srli_epi64(vprod2x0246, 31);
- const __m256i vq31prod2x1357 = _mm256_add_epi64(vprod2x1357, vprod2x1357);
-
- const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
- const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
- const __m256i vq31prod2x01234567 = _mm256_blend_epi16(vq31prod2x0246, vq31prod2x1357, 0xCC);
-
- const __m256i vremainder_mask = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.remainder_mask);
- const __m256i vrem0x01234567 =
- _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
- const __m256i vrem1x01234567 =
- _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
- const __m256i vrem2x01234567 =
- _mm256_add_epi32(_mm256_and_si256(vq31prod2x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod2x01234567));
-
- const __m256i vremainder_threshold = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_avx2.shift);
- vacc0x01234567 =
- _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
- vacc1x01234567 =
- _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
- vacc2x01234567 =
- _mm256_sub_epi32(_mm256_sra_epi32(vq31prod2x01234567, vshift), _mm256_cmpgt_epi32(vrem2x01234567, vremainder_threshold));
-
- const __m256i voutput_zero_point = _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_zero_point);
- __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
- __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
-
- vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
- vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
-
- __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
-
- vout = _mm256_max_epi8(vout, _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_min));
- vout = _mm256_min_epi8(vout, _mm256_load_si256((const __m256i*) params->gemmlowp_avx2.output_max));
-
- __m128i vout_lo = _mm256_castsi256_si128(vout);
- __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
-
- if (nc >= 8) {
- _mm_storel_epi64((__m128i*) c0, vout_lo);
- _mm_storel_epi64((__m128i*) c1, vout_hi);
- _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
- c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
- a2 = (const int8_t*) ((uintptr_t) a2 - kc);
-
- nc -= 8;
- } else {
- if (nc & 4) {
- _mm_storeu_si32(c0, vout_lo);
- _mm_storeu_si32(c1, vout_hi);
- *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
-
- c0 += 4;
- c1 += 4;
- c2 += 4;
-
- vout_lo = _mm_srli_epi64(vout_lo, 32);
- vout_hi = _mm_srli_epi64(vout_hi, 32);
- }
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
- *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
-
- c0 += 2;
- c1 += 2;
- c2 += 2;
-
- vout_lo = _mm_srli_epi32(vout_lo, 16);
- vout_hi = _mm_srli_epi32(vout_hi, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
- *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
- *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-avx.c
similarity index 70%
rename from src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
rename to src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-avx.c
index c33e288..ec55f91 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-avx.c
@@ -16,7 +16,7 @@
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx(
size_t mr,
size_t nc,
size_t kc,
@@ -180,68 +180,31 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+ __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+ __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc3x1133 = _mm_shuffle_epi32(vacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+ vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
+ vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
- const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
- const __m128i vprod3x02 = _mm_add_epi64(_mm_mul_epi32(vacc3x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+ vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+ vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
- const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
- const __m128i vprod3x13 = _mm_add_epi64(_mm_mul_epi32(vacc3x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
- const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
- const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
- const __m128i vq31prod3x02 = _mm_srli_epi64(vprod3x02, 31);
- const __m128i vq31prod3x13 = _mm_add_epi64(vprod3x13, vprod3x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
- const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
- const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
- const __m128i vrem3x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
- vacc3x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse2.c
similarity index 63%
copy from src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
copy to src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse2.c
index c33e288..2974662 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse2.c
@@ -9,14 +9,14 @@
#include <assert.h>
-#include <smmintrin.h>
+#include <emmintrin.h>
#include <xnnpack/gemm.h>
#include <xnnpack/math.h>
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2(
size_t mr,
size_t nc,
size_t kc,
@@ -69,16 +69,16 @@
size_t k = kc;
while (k >= 8 * sizeof(int8_t)) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+ const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
a0 += 8;
const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+ const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
a1 += 8;
const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+ const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
a2 += 8;
const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
- const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+ const __m128i vxa3 = _mm_srai_epi16(_mm_unpacklo_epi8(va3, va3), 8);
a3 += 8;
const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -127,16 +127,16 @@
}
if (k != 0) {
const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
+ const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
a0 = (const int8_t*) ((uintptr_t) a0 + k);
const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
+ const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
a1 = (const int8_t*) ((uintptr_t) a1 + k);
const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
+ const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
a2 = (const int8_t*) ((uintptr_t) a2 + k);
const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
- const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
+ const __m128i vxa3 = _mm_srai_epi16(_mm_unpacklo_epi8(va3, va3), 8);
a3 = (const int8_t*) ((uintptr_t) a3 + k);
const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
@@ -180,74 +180,42 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+ __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+ __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc3x1133 = _mm_shuffle_epi32(vacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+ vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
+ vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
- const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
- const __m128i vprod3x02 = _mm_add_epi64(_mm_mul_epi32(vacc3x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+ vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+ vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
- const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
- const __m128i vprod3x13 = _mm_add_epi64(_mm_mul_epi32(vacc3x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
- const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
- const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
- const __m128i vq31prod3x02 = _mm_srli_epi64(vprod3x02, 31);
- const __m128i vq31prod3x13 = _mm_add_epi64(vprod3x13, vprod3x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
- const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
- const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
- const __m128i vrem3x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
- vacc3x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
+ vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
+ vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
- *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
- *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
+ vout = _mm_srli_si128(vout, 4);
+ *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
@@ -273,10 +241,10 @@
vout = _mm_srli_epi32(vout, 16);
}
if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout, 0);
- *c1 = (int8_t) _mm_extract_epi8(vout, 4);
- *c2 = (int8_t) _mm_extract_epi8(vout, 8);
- *c3 = (int8_t) _mm_extract_epi8(vout, 12);
+ *c0 = (int8_t) _mm_cvtsi128_si32(vout);
+ *c1 = (int8_t) _mm_extract_epi16(vout, 2);
+ *c2 = (int8_t) _mm_extract_epi16(vout, 4);
+ *c3 = (int8_t) _mm_extract_epi16(vout, 6);
}
nc = 0;
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse41.c
similarity index 70%
copy from src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
copy to src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse41.c
index c33e288..ca8b6d9 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-avx.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-sse41.c
@@ -16,7 +16,7 @@
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41(
size_t mr,
size_t nc,
size_t kc,
@@ -180,68 +180,31 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+ __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+ __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc3x1133 = _mm_shuffle_epi32(vacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+ vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
+ vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
- const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
- const __m128i vprod3x02 = _mm_add_epi64(_mm_mul_epi32(vacc3x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+ vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+ vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
- const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
- const __m128i vprod3x13 = _mm_add_epi64(_mm_mul_epi32(vacc3x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
- const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
- const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
- const __m128i vq31prod3x02 = _mm_srli_epi64(vprod3x02, 31);
- const __m128i vq31prod3x13 = _mm_add_epi64(vprod3x13, vprod3x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
- const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
- const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
- const __m128i vrem3x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
- vacc3x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-ssse3.c
similarity index 100%
rename from src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-ssse3.c
rename to src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-ssse3.c
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-xop.c
similarity index 70%
rename from src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
rename to src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-xop.c
index 462b459..943403e 100644
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-xop.c
+++ b/src/qs8-gemm/gen/4x4c2-xw-minmax-fp32-xop.c
@@ -21,7 +21,7 @@
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop(
+void xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop(
size_t mr,
size_t nc,
size_t kc,
@@ -185,68 +185,31 @@
}
}
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
+ __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
+ __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
+ __m128 vscaled2x0123 = _mm_cvtepi32_ps(vacc2x0123);
+ __m128 vscaled3x0123 = _mm_cvtepi32_ps(vacc3x0123);
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc3x1133 = _mm_shuffle_epi32(vacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
+ vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
+ vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
+ vscaled2x0123 = _mm_mul_ps(vscaled2x0123, vscale);
+ vscaled3x0123 = _mm_mul_ps(vscaled3x0123, vscale);
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
- const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
- const __m128i vprod3x02 = _mm_add_epi64(_mm_mul_epi32(vacc3x0123, vmultiplier), vrounding);
+ vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
+ vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
+ vacc2x0123 = _mm_cvtps_epi32(vscaled2x0123);
+ vacc3x0123 = _mm_cvtps_epi32(vscaled3x0123);
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
- const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
- const __m128i vprod3x13 = _mm_add_epi64(_mm_mul_epi32(vacc3x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
- const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
- const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
- const __m128i vq31prod3x02 = _mm_srli_epi64(vprod3x02, 31);
- const __m128i vq31prod3x13 = _mm_add_epi64(vprod3x13, vprod3x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
- const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
- const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
- const __m128i vrem3x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
- vacc3x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
__m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
__m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
__m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
+ vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
+ vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_max));
if (nc >= 4) {
*((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
deleted file mode 100644
index 488e22d..0000000
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse2.c
+++ /dev/null
@@ -1,330 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c2-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <emmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 4);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 2);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr < 2) {
- a1 = a0;
- c1 = c0;
- }
- const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
- int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
- if XNN_UNPREDICTABLE(mr <= 2) {
- a2 = a1;
- c2 = c1;
- }
- const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
- int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
- if XNN_UNPREDICTABLE(mr != 4) {
- a3 = a2;
- c3 = c2;
- }
-
- do {
- __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
- __m128i vacc1x0123 = vacc0x0123;
- __m128i vacc2x0123 = vacc0x0123;
- __m128i vacc3x0123 = vacc0x0123;
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = kc;
- while (k >= 8 * sizeof(int8_t)) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
- a0 += 8;
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
- a1 += 8;
- const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
- a2 += 8;
- const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
- const __m128i vxa3 = _mm_srai_epi16(_mm_unpacklo_epi8(va3, va3), 8);
- a3 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc3x0123 = _mm_add_epi32(vacc3x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc3x0123 = _mm_add_epi32(vacc3x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc3x0123 = _mm_add_epi32(vacc3x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc3x0123 = _mm_add_epi32(vacc3x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k -= 8 * sizeof(int8_t);
- }
- if (k != 0) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_srai_epi16(_mm_unpacklo_epi8(va0, va0), 8);
- a0 = (const int8_t*) ((uintptr_t) a0 + k);
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_srai_epi16(_mm_unpacklo_epi8(va1, va1), 8);
- a1 = (const int8_t*) ((uintptr_t) a1 + k);
- const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_srai_epi16(_mm_unpacklo_epi8(va2, va2), 8);
- a2 = (const int8_t*) ((uintptr_t) a2 + k);
- const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
- const __m128i vxa3 = _mm_srai_epi16(_mm_unpacklo_epi8(va3, va3), 8);
- a3 = (const int8_t*) ((uintptr_t) a3 + k);
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc3x0123 = _mm_add_epi32(vacc3x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
- if (k > 2 * sizeof(int8_t)) {
- const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc3x0123 = _mm_add_epi32(vacc3x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
- if (k > 4 * sizeof(int8_t)) {
- const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc3x0123 = _mm_add_epi32(vacc3x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- }
- }
- }
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.rounding);
-
- const __m128i vnmask0x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0x0123);
- const __m128i vnmask1x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc1x0123);
- const __m128i vnmask2x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc2x0123);
- const __m128i vnmask3x0123 = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc3x0123);
-
- const __m128i vabsacc0x0123 = _mm_sub_epi32(_mm_xor_si128(vacc0x0123, vnmask0x0123), vnmask0x0123);
- const __m128i vabsacc1x0123 = _mm_sub_epi32(_mm_xor_si128(vacc1x0123, vnmask1x0123), vnmask1x0123);
- const __m128i vabsacc2x0123 = _mm_sub_epi32(_mm_xor_si128(vacc2x0123, vnmask2x0123), vnmask2x0123);
- const __m128i vabsacc3x0123 = _mm_sub_epi32(_mm_xor_si128(vacc3x0123, vnmask3x0123), vnmask3x0123);
-
- const __m128i vabsacc0x1133 = _mm_shuffle_epi32(vabsacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vabsacc1x1133 = _mm_shuffle_epi32(vabsacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vabsacc2x1133 = _mm_shuffle_epi32(vabsacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vabsacc3x1133 = _mm_shuffle_epi32(vabsacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vabsprod0x02 = _mm_mul_epu32(vabsacc0x0123, vmultiplier);
- const __m128i vabsprod1x02 = _mm_mul_epu32(vabsacc1x0123, vmultiplier);
- const __m128i vabsprod2x02 = _mm_mul_epu32(vabsacc2x0123, vmultiplier);
- const __m128i vabsprod3x02 = _mm_mul_epu32(vabsacc3x0123, vmultiplier);
-
- const __m128i vnmask0x02 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask1x02 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask2x02 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(2, 2, 0, 0));
- const __m128i vnmask3x02 = _mm_shuffle_epi32(vnmask3x0123, _MM_SHUFFLE(2, 2, 0, 0));
-
- const __m128i vprod0x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x02, vnmask0x02), vnmask0x02);
- const __m128i vprod1x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x02, vnmask1x02), vnmask1x02);
- const __m128i vprod2x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x02, vnmask2x02), vnmask2x02);
- const __m128i vprod3x02 = _mm_sub_epi64(_mm_xor_si128(vabsprod3x02, vnmask3x02), vnmask3x02);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(_mm_add_epi64(vprod0x02, vrounding), 31);
- const __m128i vq31prod1x02 = _mm_srli_epi64(_mm_add_epi64(vprod1x02, vrounding), 31);
- const __m128i vq31prod2x02 = _mm_srli_epi64(_mm_add_epi64(vprod2x02, vrounding), 31);
- const __m128i vq31prod3x02 = _mm_srli_epi64(_mm_add_epi64(vprod3x02, vrounding), 31);
-
- const __m128i vabsprod0x13 = _mm_mul_epu32(vabsacc0x1133, vmultiplier);
- const __m128i vabsprod1x13 = _mm_mul_epu32(vabsacc1x1133, vmultiplier);
- const __m128i vabsprod2x13 = _mm_mul_epu32(vabsacc2x1133, vmultiplier);
- const __m128i vabsprod3x13 = _mm_mul_epu32(vabsacc3x1133, vmultiplier);
-
- const __m128i vnmask0x13 = _mm_shuffle_epi32(vnmask0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask1x13 = _mm_shuffle_epi32(vnmask1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask2x13 = _mm_shuffle_epi32(vnmask2x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vnmask3x13 = _mm_shuffle_epi32(vnmask3x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod0x13, vnmask0x13), vnmask0x13);
- const __m128i vprod1x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod1x13, vnmask1x13), vnmask1x13);
- const __m128i vprod2x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod2x13, vnmask2x13), vnmask2x13);
- const __m128i vprod3x13 = _mm_sub_epi64(_mm_xor_si128(vabsprod3x13, vnmask3x13), vnmask3x13);
-
- const __m128i vq31prod0x13 = _mm_srli_epi64(_mm_add_epi64(vprod0x13, vrounding), 31);
- const __m128i vq31prod1x13 = _mm_srli_epi64(_mm_add_epi64(vprod1x13, vrounding), 31);
- const __m128i vq31prod2x13 = _mm_srli_epi64(_mm_add_epi64(vprod2x13, vrounding), 31);
- const __m128i vq31prod3x13 = _mm_srli_epi64(_mm_add_epi64(vprod3x13, vrounding), 31);
-
- const __m128i vq31prod0x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod0x02), _mm_castsi128_ps(vq31prod0x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod1x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod1x02), _mm_castsi128_ps(vq31prod1x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod2x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod2x02), _mm_castsi128_ps(vq31prod2x13), _MM_SHUFFLE(2, 0, 2, 0)));
- const __m128i vq31prod3x0213 = _mm_castps_si128(_mm_shuffle_ps(
- _mm_castsi128_ps(vq31prod3x02), _mm_castsi128_ps(vq31prod3x13), _MM_SHUFFLE(2, 0, 2, 0)));
-
- const __m128i vq31prod0x0123 = _mm_shuffle_epi32(vq31prod0x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod1x0123 = _mm_shuffle_epi32(vq31prod1x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod2x0123 = _mm_shuffle_epi32(vq31prod2x0213, _MM_SHUFFLE(3, 1, 2, 0));
- const __m128i vq31prod3x0123 = _mm_shuffle_epi32(vq31prod3x0213, _MM_SHUFFLE(3, 1, 2, 0));
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
- const __m128i vrem3x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse2.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
- vacc3x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_zero_point);
- __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
- __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
-
- const __m128i voutput_min = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_min);
- const __m128i voutput_max = _mm_load_si128((const __m128i*) params->gemmlowp_sse2.output_max);
- vacc01x0123 = _mm_min_epi16(_mm_max_epi16(vacc01x0123, voutput_min), voutput_max);
- vacc23x0123 = _mm_min_epi16(_mm_max_epi16(vacc23x0123, voutput_min), voutput_max);
-
- __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
-
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- vout = _mm_srli_si128(vout, 4);
- *((uint32_t*) c1) = (uint32_t) _mm_cvtsi128_si32(vout);
- vout = _mm_srli_si128(vout, 4);
- *((uint32_t*) c2) = (uint32_t) _mm_cvtsi128_si32(vout);
- vout = _mm_srli_si128(vout, 4);
- *((uint32_t*) c3) = (uint32_t) _mm_cvtsi128_si32(vout);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
- c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
- c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
- a2 = (const int8_t*) ((uintptr_t) a2 - kc);
- a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
- c1 += 2;
- *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
- c2 += 2;
- *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
- c3 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_cvtsi128_si32(vout);
- *c1 = (int8_t) _mm_extract_epi16(vout, 2);
- *c2 = (int8_t) _mm_extract_epi16(vout, 4);
- *c3 = (int8_t) _mm_extract_epi16(vout, 6);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c b/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
deleted file mode 100644
index 6aaba0d..0000000
--- a/src/qs8-gemm/gen/4x4c2-xw-minmax-gemmlowp-sse41.c
+++ /dev/null
@@ -1,285 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/qs8-gemm/MRx4c2-sse.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <smmintrin.h>
-
-#include <xnnpack/gemm.h>
-#include <xnnpack/math.h>
-
-
-
-void xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41(
- size_t mr,
- size_t nc,
- size_t kc,
- const int8_t* restrict a,
- size_t a_stride,
- const void* restrict w,
- int8_t* restrict c,
- size_t cm_stride,
- size_t cn_stride,
- const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- assert(mr != 0);
- assert(mr <= 4);
- assert(nc != 0);
- assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
- assert(a != NULL);
- assert(w != NULL);
- assert(c != NULL);
-
- kc = round_up_po2(kc, 2);
- const int8_t* a0 = a;
- int8_t* c0 = c;
- const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
- int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
- if XNN_UNPREDICTABLE(mr < 2) {
- a1 = a0;
- c1 = c0;
- }
- const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
- int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
- if XNN_UNPREDICTABLE(mr <= 2) {
- a2 = a1;
- c2 = c1;
- }
- const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
- int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
- if XNN_UNPREDICTABLE(mr != 4) {
- a3 = a2;
- c3 = c2;
- }
-
- do {
- __m128i vacc0x0123 = _mm_loadu_si128((const __m128i*) w);
- __m128i vacc1x0123 = vacc0x0123;
- __m128i vacc2x0123 = vacc0x0123;
- __m128i vacc3x0123 = vacc0x0123;
- w = (const void*) ((const int32_t*) w + 4);
-
- size_t k = kc;
- while (k >= 8 * sizeof(int8_t)) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
- a0 += 8;
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
- a1 += 8;
- const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
- a2 += 8;
- const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
- const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
- a3 += 8;
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc3x0123 = _mm_add_epi32(vacc3x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- const __m128i vxb1 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 8));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc3x0123 = _mm_add_epi32(vacc3x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vxb2 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 16));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc3x0123 = _mm_add_epi32(vacc3x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- const __m128i vxb3 = _mm_load_si128((const __m128i*) ((const int16_t*) w + 24));
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- vacc3x0123 = _mm_add_epi32(vacc3x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
-
- w = (const void*) ((const int16_t*) w + 32);
- k -= 8 * sizeof(int8_t);
- }
- if (k != 0) {
- const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
- const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
- a0 = (const int8_t*) ((uintptr_t) a0 + k);
- const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
- const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
- a1 = (const int8_t*) ((uintptr_t) a1 + k);
- const __m128i va2 = _mm_loadl_epi64((const __m128i*) a2);
- const __m128i vxa2 = _mm_cvtepi8_epi16(va2);
- a2 = (const int8_t*) ((uintptr_t) a2 + k);
- const __m128i va3 = _mm_loadl_epi64((const __m128i*) a3);
- const __m128i vxa3 = _mm_cvtepi8_epi16(va3);
- a3 = (const int8_t*) ((uintptr_t) a3 + k);
-
- const __m128i vxb0 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- vacc3x0123 = _mm_add_epi32(vacc3x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-
- if (k > 2 * sizeof(int8_t)) {
- const __m128i vxb1 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- vacc3x0123 = _mm_add_epi32(vacc3x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-
- if (k > 4 * sizeof(int8_t)) {
- const __m128i vxb2 = _mm_load_si128((const __m128i*) w);
- w = (const void*) ((const int16_t*) w + 8);
-
- vacc0x0123 = _mm_add_epi32(vacc0x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc1x0123 = _mm_add_epi32(vacc1x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa1, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc2x0123 = _mm_add_epi32(vacc2x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- vacc3x0123 = _mm_add_epi32(vacc3x0123,
- _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- }
- }
- }
-
- const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.multiplier);
- const __m128i vrounding = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.rounding);
-
- const __m128i vacc0x1133 = _mm_shuffle_epi32(vacc0x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc1x1133 = _mm_shuffle_epi32(vacc1x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc2x1133 = _mm_shuffle_epi32(vacc2x0123, _MM_SHUFFLE(3, 3, 1, 1));
- const __m128i vacc3x1133 = _mm_shuffle_epi32(vacc3x0123, _MM_SHUFFLE(3, 3, 1, 1));
-
- const __m128i vprod0x02 = _mm_add_epi64(_mm_mul_epi32(vacc0x0123, vmultiplier), vrounding);
- const __m128i vprod1x02 = _mm_add_epi64(_mm_mul_epi32(vacc1x0123, vmultiplier), vrounding);
- const __m128i vprod2x02 = _mm_add_epi64(_mm_mul_epi32(vacc2x0123, vmultiplier), vrounding);
- const __m128i vprod3x02 = _mm_add_epi64(_mm_mul_epi32(vacc3x0123, vmultiplier), vrounding);
-
- const __m128i vprod0x13 = _mm_add_epi64(_mm_mul_epi32(vacc0x1133, vmultiplier), vrounding);
- const __m128i vprod1x13 = _mm_add_epi64(_mm_mul_epi32(vacc1x1133, vmultiplier), vrounding);
- const __m128i vprod2x13 = _mm_add_epi64(_mm_mul_epi32(vacc2x1133, vmultiplier), vrounding);
- const __m128i vprod3x13 = _mm_add_epi64(_mm_mul_epi32(vacc3x1133, vmultiplier), vrounding);
-
- const __m128i vq31prod0x02 = _mm_srli_epi64(vprod0x02, 31);
- const __m128i vq31prod0x13 = _mm_add_epi64(vprod0x13, vprod0x13);
- const __m128i vq31prod1x02 = _mm_srli_epi64(vprod1x02, 31);
- const __m128i vq31prod1x13 = _mm_add_epi64(vprod1x13, vprod1x13);
- const __m128i vq31prod2x02 = _mm_srli_epi64(vprod2x02, 31);
- const __m128i vq31prod2x13 = _mm_add_epi64(vprod2x13, vprod2x13);
- const __m128i vq31prod3x02 = _mm_srli_epi64(vprod3x02, 31);
- const __m128i vq31prod3x13 = _mm_add_epi64(vprod3x13, vprod3x13);
-
- const __m128i vq31prod0x0123 = _mm_blend_epi16(vq31prod0x02, vq31prod0x13, 0xCC);
- const __m128i vq31prod1x0123 = _mm_blend_epi16(vq31prod1x02, vq31prod1x13, 0xCC);
- const __m128i vq31prod2x0123 = _mm_blend_epi16(vq31prod2x02, vq31prod2x13, 0xCC);
- const __m128i vq31prod3x0123 = _mm_blend_epi16(vq31prod3x02, vq31prod3x13, 0xCC);
-
- const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_mask);
- const __m128i vrem0x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod0x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod0x0123));
- const __m128i vrem1x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod1x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod1x0123));
- const __m128i vrem2x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod2x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod2x0123));
- const __m128i vrem3x0123 =
- _mm_add_epi32(_mm_and_si128(vq31prod3x0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vq31prod3x0123));
-
- const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.remainder_threshold);
- const __m128i vshift = _mm_loadl_epi64((const __m128i*) params->gemmlowp_sse4.shift);
- vacc0x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod0x0123, vshift), _mm_cmpgt_epi32(vrem0x0123, vremainder_threshold));
- vacc1x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod1x0123, vshift), _mm_cmpgt_epi32(vrem1x0123, vremainder_threshold));
- vacc2x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod2x0123, vshift), _mm_cmpgt_epi32(vrem2x0123, vremainder_threshold));
- vacc3x0123 =
- _mm_sub_epi32(_mm_sra_epi32(vq31prod3x0123, vshift), _mm_cmpgt_epi32(vrem3x0123, vremainder_threshold));
-
- const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_zero_point);
- __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
- __m128i vacc23x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc2x0123, vacc3x0123), voutput_zero_point);
-
-
- __m128i vout = _mm_packs_epi16(vacc01x0123, vacc23x0123);
-
- vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_min));
- vout = _mm_min_epi8(vout, _mm_load_si128((const __m128i*) params->gemmlowp_sse4.output_max));
-
- if (nc >= 4) {
- *((uint32_t*) c0) = (uint32_t) _mm_cvtsi128_si32(vout);
- *((uint32_t*) c1) = (uint32_t) _mm_extract_epi32(vout, 1);
- *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout, 2);
- *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout, 3);
-
- c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
- c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
- c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
- c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
-
- a0 = (const int8_t*) ((uintptr_t) a0 - kc);
- a1 = (const int8_t*) ((uintptr_t) a1 - kc);
- a2 = (const int8_t*) ((uintptr_t) a2 - kc);
- a3 = (const int8_t*) ((uintptr_t) a3 - kc);
-
- nc -= 4;
- } else {
- if (nc & 2) {
- *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout, 0);
- c0 += 2;
- *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout, 2);
- c1 += 2;
- *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout, 4);
- c2 += 2;
- *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout, 6);
- c3 += 2;
- vout = _mm_srli_epi32(vout, 16);
- }
- if (nc & 1) {
- *c0 = (int8_t) _mm_extract_epi8(vout, 0);
- *c1 = (int8_t) _mm_extract_epi8(vout, 4);
- *c2 = (int8_t) _mm_extract_epi8(vout, 8);
- *c3 = (int8_t) _mm_extract_epi8(vout, 12);
- }
-
- nc = 0;
- }
- } while (nc != 0);
-}
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 83739f5..99d15d1 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -938,50 +938,45 @@
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__ssse3)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__ssse3)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__sse41)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__sse41)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__avx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__avx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c2__xop)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c2__xop)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop)
-
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__avx2)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x8c8__avx2)
@@ -991,10 +986,6 @@
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_2x8c8__avx2)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_fp32_ukernel_3x8c8__avx2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2)
-DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2)
-
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x8c8__avx2)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x8c8__avx2)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x8c8__avx2)
diff --git a/test/qs8-gemm-minmax-fp32.cc b/test/qs8-gemm-minmax-fp32.cc
index bd2e271..7ccf6d1 100644
--- a/test/qs8-gemm-minmax-fp32.cc
+++ b/test/qs8-gemm-minmax-fp32.cc
@@ -45623,6 +45623,14049 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_eq_8) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_lt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_gt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_div_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_gt_4) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_div_4) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE2, strided_cm) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_eq_8) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_lt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_gt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_div_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_gt_4) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_div_4) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE2, strided_cm) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_eq_8) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_lt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_gt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_div_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_gt_4) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_div_4) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE2, strided_cm) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_eq_8) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_lt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_gt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_div_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_gt_4) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_div_4) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE2, strided_cm) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_eq_8) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_lt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_gt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_div_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_gt_4) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_div_4) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__SSE41, strided_cm) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_eq_8) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_lt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_gt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_div_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)  // use loop n so the n > NR cases are actually exercised (was hard-coded 4)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)  // use loop n (was hard-coded 4), matching the n_div_4_strided_cn tests below
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)  // use loop n so n = 8, 12 are actually tested (was hard-coded 4)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__SSE41, strided_cm) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_eq_8) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_lt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_gt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_div_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)  // use loop n so the n > NR cases are actually exercised (was hard-coded 4)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)  // use loop n (was hard-coded 4), matching the n_div_4_strided_cn test below
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(3)
+          .n(n)  // use loop n so n = 8, 12 are actually tested (was hard-coded 4)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__SSE41, strided_cm) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_eq_8) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_lt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_gt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_div_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)  // use loop n so the n > NR cases are actually exercised (was hard-coded 4)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)  // use loop n (was hard-coded 4), matching the n_div_4_strided_cn test below
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)  // use loop n so n = 8, 12 are actually tested (was hard-coded 4)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__SSE41, strided_cm) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_eq_8) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_lt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_gt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_div_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, k_div_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_gt_4) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_div_4) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, n_div_4_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, strided_cm_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__AVX, strided_cm) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_eq_8) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_lt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_gt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_div_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, k_div_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_gt_4) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_div_4) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, n_div_4_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, strided_cm_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__AVX, strided_cm) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_eq_8) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_lt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_gt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_div_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, k_div_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_gt_4) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_div_4) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, n_div_4_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, strided_cm_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__AVX, strided_cm) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_eq_8) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_lt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_gt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_div_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, k_div_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_gt_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_div_4) {
+    TEST_REQUIRES_X86_AVX;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(4)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(4)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, n_div_4_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, strided_cm_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__AVX, strided_cm) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_eq_8) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_lt_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_gt_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_div_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, k_div_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, n_div_4_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, strided_cm_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C2__XOP, strided_cm) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_eq_8) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_lt_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_gt_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_div_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, k_div_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_gt_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_div_4) {
+    TEST_REQUIRES_X86_XOP;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(2)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, n_div_4_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, strided_cm_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C2__XOP, strided_cm) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_eq_8) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_lt_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_gt_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_div_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, k_div_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_gt_4) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_div_4) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, n_div_4_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, strided_cm_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C2__XOP, strided_cm) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_eq_8) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_lt_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_gt_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_div_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, k_div_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_gt_4) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_div_4) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, n_div_4_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, strided_cm_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_4X4C2__XOP, strided_cm) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(4)
+ .nr(4)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_eq_8) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_lt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_gt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_div_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_gt_4) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_div_4) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE2, strided_cm) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_eq_8) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_lt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_gt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_div_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_gt_4) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_div_4) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE2, strided_cm) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_eq_8) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_lt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_gt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_div_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_gt_4) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_div_4) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE2, strided_cm) {
+ TEST_REQUIRES_X86_SSE2;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_eq_8) {
+ TEST_REQUIRES_X86_SSSE3;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, strided_cn) {
+ TEST_REQUIRES_X86_SSSE3;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_lt_8) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_gt_8) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_div_8) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_gt_4) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_div_4) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSSE3, strided_cm) {
+ TEST_REQUIRES_X86_SSSE3;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_eq_8) {
+ TEST_REQUIRES_X86_SSSE3;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, strided_cn) {
+ TEST_REQUIRES_X86_SSSE3;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_lt_8) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_gt_8) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_div_8) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_gt_4) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_div_4) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSSE3, strided_cm) {
+ TEST_REQUIRES_X86_SSSE3;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_eq_8) {
+ TEST_REQUIRES_X86_SSSE3;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, strided_cn) {
+ TEST_REQUIRES_X86_SSSE3;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_lt_8) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_gt_8) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_div_8) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_gt_4) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_div_4) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSSE3;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSSE3, strided_cm) {
+ TEST_REQUIRES_X86_SSSE3;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_fp32_sse2_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_eq_8) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_lt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_gt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_div_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(1)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(1)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__SSE41, strided_cm) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_eq_8) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_lt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_gt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_div_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(2)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(2)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__SSE41, strided_cm) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_eq_8) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_lt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_gt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_div_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, k_div_8_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_gt_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_gt_4_strided_cn) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 5; n < 8; n++) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .cn_stride(7)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+  TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_div_4) {
+    TEST_REQUIRES_X86_SSE41;
+    for (uint32_t n = 8; n <= 12; n += 4) {
+      for (size_t k = 1; k <= 40; k += 9) {
+        GemmMicrokernelTester()
+          .extended_weights(true)
+          .mr(3)
+          .nr(4)
+          .kr(8)
+          .sr(1)
+          .m(3)
+          .n(n)
+          .k(k)
+          .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+      }
+    }
+  }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, n_div_4_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, strided_cm_subtile) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__SSE41, strided_cm) {
+ TEST_REQUIRES_X86_SSE41;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_eq_8) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_lt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_gt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_div_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, k_div_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_gt_4) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_div_4) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, n_div_4_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, strided_cm_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__AVX, strided_cm) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_eq_8) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_lt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_gt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_div_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, k_div_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_gt_4) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_div_4) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, n_div_4_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, strided_cm_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__AVX, strided_cm) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_eq_8) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_lt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_gt_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_div_8) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, k_div_8_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_gt_4) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_div_4) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, n_div_4_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, strided_cm_subtile) {
+ TEST_REQUIRES_X86_AVX;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__AVX, strided_cm) {
+ TEST_REQUIRES_X86_AVX;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_eq_8) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_lt_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_gt_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_div_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, k_div_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_gt_4) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_div_4) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, n_div_4_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, strided_cm_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_1X4C8__XOP, strided_cm) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(1)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(1)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_eq_8) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_lt_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_gt_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_div_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, k_div_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_gt_4) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_div_4) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, n_div_4_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, strided_cm_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_2X4C8__XOP, strided_cm) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(2)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(2)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_eq_8) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_eq_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_eq_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_eq_8_subtile_m) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(4)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_eq_8_subtile_n) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_lt_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_lt_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(11)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_lt_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_gt_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_gt_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(19)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_gt_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_div_8) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_div_8_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .a_stride(83)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, k_div_8_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_gt_4) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_gt_4_strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_gt_4_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_gt_4_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 5; n < 8; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_div_4) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(k)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_div_4_strided_cn) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .cn_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_div_4_strided_a) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .a_stride(43)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, n_div_4_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (uint32_t n = 8; n <= 12; n += 4) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, strided_cm_subtile) {
+ TEST_REQUIRES_X86_XOP;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 4; n++) {
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(7)
+ .iterations(1)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_GEMM_XW_MINMAX_FP32_3X4C8__XOP, strided_cm) {
+ TEST_REQUIRES_X86_XOP;
+ GemmMicrokernelTester()
+ .extended_weights(true)
+ .mr(3)
+ .nr(4)
+ .kr(8)
+ .sr(1)
+ .m(3)
+ .n(4)
+ .k(8)
+ .cm_stride(7)
+ .Test(xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_fp32_sse4_params, xnn_init_qs8_requantization_fp32_params, xnn_qs8_requantize_fp32);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QS8_GEMM_MINMAX_FP32_1X8C8__AVX2, k_eq_8) {
TEST_REQUIRES_X86_AVX2;
GemmMicrokernelTester()
diff --git a/test/qs8-gemm-minmax-fp32.yaml b/test/qs8-gemm-minmax-fp32.yaml
index 3972b36..f0ffdb6 100644
--- a/test/qs8-gemm-minmax-fp32.yaml
+++ b/test/qs8-gemm-minmax-fp32.yaml
@@ -303,6 +303,99 @@
- name: xnn_qs8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128
init: xnn_init_qs8_conv_minmax_fp32_sse4_params
k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse2
+ init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse2
+ init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse2
+ init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse2
+ init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__sse41
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__sse41
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__sse41
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__sse41
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__avx
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__avx
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__avx
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__avx
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c2__xop
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c2__xop
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c2__xop
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_4x4c2__xop
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse2
+ init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse2
+ init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse2
+ init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__ssse3
+ init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__ssse3
+ init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__ssse3
+ init: xnn_init_qs8_conv_minmax_fp32_sse2_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__sse41
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__sse41
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__sse41
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__avx
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__avx
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__avx
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_1x4c8__xop
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_2x4c8__xop
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
+- name: xnn_qs8_gemm_xw_minmax_fp32_ukernel_3x4c8__xop
+ init: xnn_init_qs8_conv_minmax_fp32_sse4_params
+ k-block: 8
- name: xnn_qs8_gemm_minmax_fp32_ukernel_1x8c8__avx2
init: xnn_init_qs8_conv_minmax_fp32_avx2_params
k-block: 8
diff --git a/test/qs8-gemm-minmax-gemmlowp.cc b/test/qs8-gemm-minmax-gemmlowp.cc
index a160ecf..5c62d4b 100644
--- a/test/qs8-gemm-minmax-gemmlowp.cc
+++ b/test/qs8-gemm-minmax-gemmlowp.cc
@@ -36503,4536 +36503,6 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_eq_8) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_lt_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_gt_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_div_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_gt_4) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_div_4) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE2, strided_cm) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_eq_8) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_lt_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_gt_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_div_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_gt_4) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_div_4) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE2, strided_cm) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_eq_8) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_lt_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_gt_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_div_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_gt_4) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_div_4) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSSE3, strided_cm) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_eq_8) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_lt_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_gt_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_div_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_gt_4) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_div_4) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSSE3, strided_cm) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_eq_8) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_lt_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_gt_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_div_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_gt_4) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_div_4) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__SSE41, strided_cm) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_eq_8) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_lt_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_gt_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_div_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_gt_4) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_div_4) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__SSE41, strided_cm) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_eq_8) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, strided_cn) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_eq_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_lt_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_lt_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_gt_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_gt_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_div_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_div_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, k_div_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_gt_4) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_gt_4_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_div_4) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_div_4_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, n_div_4_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, strided_cm_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__AVX, strided_cm) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_eq_8) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, strided_cn) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_eq_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_lt_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_lt_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_gt_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_gt_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_div_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_div_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, k_div_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_gt_4) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_gt_4_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_div_4) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_div_4_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, n_div_4_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, strided_cm_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__AVX, strided_cm) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_eq_8) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, strided_cn) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_eq_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_lt_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_lt_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_gt_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_gt_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_div_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_div_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, k_div_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_gt_4) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_gt_4_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_div_4) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_div_4_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, n_div_4_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, strided_cm_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C2__XOP, strided_cm) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_eq_8) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, strided_cn) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_eq_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_lt_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_lt_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_gt_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_gt_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_div_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_div_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, k_div_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_gt_4) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_gt_4_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_div_4) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_div_4_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, n_div_4_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, strided_cm_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 4; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_4X4C2__XOP, strided_cm) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(4)
- .nr(4)
- .kr(2)
- .sr(1)
- .m(4)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QS8_GEMM_MINMAX_GEMMLOWP_2X4C8__SSE2_LD64, k_eq_8) {
TEST_REQUIRES_X86_SSE2;
GemmMicrokernelTester()
@@ -42401,6801 +37871,6 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_eq_8) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_lt_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_gt_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_div_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_gt_4) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_div_4) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE2, strided_cm) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_eq_8) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_lt_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_gt_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_div_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_gt_4) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_div_4) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE2, strided_cm) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_eq_8) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_lt_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_gt_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_div_8) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_gt_4) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_div_4) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSE2;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE2, strided_cm) {
- TEST_REQUIRES_X86_SSE2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_eq_8) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_lt_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_gt_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_div_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_gt_4) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_div_4) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSSE3, strided_cm) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_eq_8) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_lt_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_gt_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_div_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_gt_4) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_div_4) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSSE3, strided_cm) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_eq_8) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_lt_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_gt_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_div_8) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_gt_4) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_div_4) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSSE3;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSSE3, strided_cm) {
- TEST_REQUIRES_X86_SSSE3;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3, xnn_init_qs8_conv_minmax_gemmlowp_sse2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_eq_8) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_lt_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_gt_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_div_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_gt_4) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_div_4) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__SSE41, strided_cm) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_eq_8) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_lt_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_gt_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_div_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_gt_4) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_div_4) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__SSE41, strided_cm) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_eq_8) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_eq_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_lt_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_lt_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_gt_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_gt_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_div_8) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_div_8_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, k_div_8_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_gt_4) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_gt_4_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_div_4) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_div_4_strided_a) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, n_div_4_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, strided_cm_subtile) {
- TEST_REQUIRES_X86_SSE41;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__SSE41, strided_cm) {
- TEST_REQUIRES_X86_SSE41;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_eq_8) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, strided_cn) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_eq_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_lt_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_lt_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_gt_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_gt_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_div_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_div_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, k_div_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_gt_4) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_gt_4_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_div_4) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_div_4_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, n_div_4_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, strided_cm_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__AVX, strided_cm) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_eq_8) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, strided_cn) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_eq_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_lt_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_lt_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_gt_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_gt_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_div_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_div_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, k_div_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_gt_4) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_gt_4_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_div_4) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_div_4_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, n_div_4_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, strided_cm_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__AVX, strided_cm) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_eq_8) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, strided_cn) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_eq_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_lt_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_lt_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_gt_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_gt_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_div_8) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_div_8_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, k_div_8_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_gt_4) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_gt_4_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_div_4) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_div_4_strided_a) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, n_div_4_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, strided_cm_subtile) {
- TEST_REQUIRES_X86_AVX;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__AVX, strided_cm) {
- TEST_REQUIRES_X86_AVX;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_eq_8) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, strided_cn) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_eq_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_lt_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_lt_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_gt_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_gt_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_div_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_div_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, k_div_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_gt_4) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_gt_4_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_div_4) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_div_4_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, n_div_4_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, strided_cm_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X4C8__XOP, strided_cm) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(1)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_eq_8) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, strided_cn) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_eq_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_lt_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_lt_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_gt_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_gt_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_div_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_div_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, k_div_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_gt_4) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_gt_4_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_div_4) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_div_4_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, n_div_4_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, strided_cm_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X4C8__XOP, strided_cm) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(2)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_eq_8) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, strided_cn) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_eq_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(4)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_lt_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_lt_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_gt_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_gt_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_div_8) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_div_8_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, k_div_8_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_gt_4) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_gt_4_strided_cn) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_gt_4_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_gt_4_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 5; n < 8; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_div_4) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_div_4_strided_cn) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .cn_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_div_4_strided_a) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, n_div_4_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (uint32_t n = 8; n <= 12; n += 4) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, strided_cm_subtile) {
- TEST_REQUIRES_X86_XOP;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 4; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(7)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X4C8__XOP, strided_cm) {
- TEST_REQUIRES_X86_XOP;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(4)
- .kr(8)
- .sr(1)
- .m(3)
- .n(4)
- .k(8)
- .cm_stride(7)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop, xnn_init_qs8_conv_minmax_gemmlowp_sse4_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X8C8__AVX2, k_eq_8) {
TEST_REQUIRES_X86_AVX2;
GemmMicrokernelTester()
@@ -50564,1365 +39239,6 @@
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_eq_8) {
- TEST_REQUIRES_X86_AVX2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(8)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, strided_cn) {
- TEST_REQUIRES_X86_AVX2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(8)
- .k(8)
- .cn_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(8)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_eq_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(8)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_lt_8) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(8)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_lt_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_gt_8) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(8)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_gt_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_div_8) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_div_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(8)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, k_div_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_gt_8) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_gt_8_strided_cn) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(8)
- .k(k)
- .cn_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_gt_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_gt_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_div_8) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_div_8_strided_cn) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .cn_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_div_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, n_div_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, strided_cm_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 1; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(11)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_1X8C8__AVX2, strided_cm) {
- TEST_REQUIRES_X86_AVX2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(1)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(1)
- .n(8)
- .k(8)
- .cm_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_eq_8) {
- TEST_REQUIRES_X86_AVX2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(8)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, strided_cn) {
- TEST_REQUIRES_X86_AVX2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(8)
- .k(8)
- .cn_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(8)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_eq_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(8)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_lt_8) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(8)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_lt_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_gt_8) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(8)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_gt_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_div_8) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_div_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(8)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, k_div_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_gt_8) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_gt_8_strided_cn) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(8)
- .k(k)
- .cn_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_gt_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_gt_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_div_8) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_div_8_strided_cn) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .cn_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_div_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, n_div_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, strided_cm_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 2; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(11)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_2X8C8__AVX2, strided_cm) {
- TEST_REQUIRES_X86_AVX2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(2)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(2)
- .n(8)
- .k(8)
- .cm_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_eq_8) {
- TEST_REQUIRES_X86_AVX2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(8)
- .k(8)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, strided_cn) {
- TEST_REQUIRES_X86_AVX2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(8)
- .k(8)
- .cn_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_eq_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(8)
- .k(8)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_eq_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_eq_8_subtile_m) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(8)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_eq_8_subtile_n) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(8)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_lt_8) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_lt_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 1; k < 8; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(8)
- .k(k)
- .a_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_lt_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 1; k < 8; k++) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_gt_8) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_gt_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 9; k < 16; k++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(8)
- .k(k)
- .a_stride(19)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_gt_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 9; k < 16; k++) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_div_8) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_div_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 16; k <= 80; k += 8) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(8)
- .k(k)
- .a_stride(83)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, k_div_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 16; k <= 80; k += 8) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_gt_8) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_gt_8_strided_cn) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(8)
- .k(k)
- .cn_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_gt_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_gt_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 9; n < 16; n++) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_div_8) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(8)
- .k(k)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_div_8_strided_cn) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .cn_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_div_8_strided_a) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 40; k += 9) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(n)
- .k(k)
- .a_stride(43)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, n_div_8_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (uint32_t n = 16; n <= 24; n += 8) {
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, strided_cm_subtile) {
- TEST_REQUIRES_X86_AVX2;
- for (size_t k = 1; k <= 40; k += 9) {
- for (uint32_t m = 1; m <= 3; m++) {
- for (uint32_t n = 1; n <= 8; n++) {
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(m)
- .n(n)
- .k(k)
- .cm_stride(11)
- .iterations(1)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
- }
- }
- }
-
- TEST(QS8_GEMM_XW_MINMAX_GEMMLOWP_3X8C8__AVX2, strided_cm) {
- TEST_REQUIRES_X86_AVX2;
- GemmMicrokernelTester()
- .extended_weights(true)
- .mr(3)
- .nr(8)
- .kr(8)
- .sr(1)
- .m(3)
- .n(8)
- .k(8)
- .cm_stride(11)
- .Test(xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2, xnn_init_qs8_conv_minmax_gemmlowp_avx2_params, xnn_init_qs8_requantization_gemmlowp_params, xnn_qs8_requantize_gemmlowp);
- }
-#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-
-
-#if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QS8_GEMM_MINMAX_GEMMLOWP_1X16C8__AVX512SKX, k_eq_8) {
TEST_REQUIRES_X86_AVX512SKX;
GemmMicrokernelTester()
diff --git a/test/qs8-gemm-minmax-gemmlowp.yaml b/test/qs8-gemm-minmax-gemmlowp.yaml
index 248f61d..65a6dd7 100644
--- a/test/qs8-gemm-minmax-gemmlowp.yaml
+++ b/test/qs8-gemm-minmax-gemmlowp.yaml
@@ -243,36 +243,6 @@
- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_4x4c2__sse41_ld64
init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse2
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse2
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__ssse3
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__ssse3
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__sse41
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__sse41
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__avx
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__avx
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c2__xop
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_4x4c2__xop
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse2_ld64
init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
k-block: 8
@@ -282,51 +252,6 @@
- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_2x4c8__sse41_ld64
init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse2
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse2
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse2
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__ssse3
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__ssse3
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__ssse3
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse2_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__sse41
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__sse41
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__sse41
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__avx
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__avx
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__avx
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x4c8__xop
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x4c8__xop
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x4c8__xop
- init: xnn_init_qs8_conv_minmax_gemmlowp_sse4_params
- k-block: 8
- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x8c8__avx2
init: xnn_init_qs8_conv_minmax_gemmlowp_avx2_params
k-block: 8
@@ -336,15 +261,6 @@
- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_3x8c8__avx2
init: xnn_init_qs8_conv_minmax_gemmlowp_avx2_params
k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_1x8c8__avx2
- init: xnn_init_qs8_conv_minmax_gemmlowp_avx2_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_2x8c8__avx2
- init: xnn_init_qs8_conv_minmax_gemmlowp_avx2_params
- k-block: 8
-- name: xnn_qs8_gemm_xw_minmax_gemmlowp_ukernel_3x8c8__avx2
- init: xnn_init_qs8_conv_minmax_gemmlowp_avx2_params
- k-block: 8
- name: xnn_qs8_gemm_minmax_gemmlowp_ukernel_1x16c8__avx512skx
init: xnn_init_qs8_conv_minmax_gemmlowp_avx512_params
k-block: 8