QS8 GEMM/IGEMM microkernels with RNDNU requantization
Enable RNDNU-requantized GEMM & IGEMM microkernels on AArch32 for a minor performance improvement on Pixel 2:
- QS8 MobileNet v1: 72716 us -> 71819 us
- QS8 MobileNet v2: 51273 us -> 50144 us
PiperOrigin-RevId: 385267824
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0d9a661..7489439 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1151,9 +1151,11 @@
src/qs8-gemm/gen/1x8c2-minmax-fp32-neon-mlal-padal-dup.c
src/qs8-gemm/gen/1x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
src/qs8-gemm/gen/1x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
+ src/qs8-gemm/gen/1x8c2-minmax-rndnu-neon-mlal-padal-dup.c
src/qs8-gemm/gen/1x8c8-minmax-fp32-neon-mlal-padal.c
src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
src/qs8-gemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
+ src/qs8-gemm/gen/1x8c8-minmax-rndnu-neon-mlal-padal.c
src/qs8-gemm/gen/1x8c16-minmax-gemmlowp-neon-mlal-padal.c
src/qs8-gemm/gen/1x16-minmax-fp32-neon-mlal-lane.c
src/qs8-gemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -1170,9 +1172,11 @@
src/qs8-gemm/gen/2x8c2-minmax-fp32-neon-mlal-padal-dup.c
src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
src/qs8-gemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
+ src/qs8-gemm/gen/2x8c2-minmax-rndnu-neon-mlal-padal-dup.c
src/qs8-gemm/gen/2x8c8-minmax-fp32-neon-mlal-padal.c
src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
src/qs8-gemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
+ src/qs8-gemm/gen/2x8c8-minmax-rndnu-neon-mlal-padal.c
src/qs8-gemm/gen/2x8c16-minmax-gemmlowp-neon-mlal-padal.c
src/qs8-gemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
src/qs8-gemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane.c
@@ -1225,9 +1229,11 @@
src/qs8-igemm/gen/1x8c2-minmax-fp32-neon-mlal-padal-dup.c
src/qs8-igemm/gen/1x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
src/qs8-igemm/gen/1x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
+ src/qs8-igemm/gen/1x8c2-minmax-rndnu-neon-mlal-padal-dup.c
src/qs8-igemm/gen/1x8c8-minmax-fp32-neon-mlal-padal.c
src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mlal-padal.c
src/qs8-igemm/gen/1x8c8-minmax-gemmlowp-neon-mull-padal.c
+ src/qs8-igemm/gen/1x8c8-minmax-rndnu-neon-mlal-padal.c
src/qs8-igemm/gen/1x8c16-minmax-gemmlowp-neon-mlal-padal.c
src/qs8-igemm/gen/1x16-minmax-fp32-neon-mlal-lane.c
src/qs8-igemm/gen/1x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
@@ -1244,9 +1250,11 @@
src/qs8-igemm/gen/2x8c2-minmax-fp32-neon-mlal-padal-dup.c
src/qs8-igemm/gen/2x8c2-minmax-gemmlowp-neon-mlal-padal-dup.c
src/qs8-igemm/gen/2x8c2-minmax-gemmlowp-neon-mull-padal-dup.c
+ src/qs8-igemm/gen/2x8c2-minmax-rndnu-neon-mlal-padal-dup.c
src/qs8-igemm/gen/2x8c8-minmax-fp32-neon-mlal-padal.c
src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mlal-padal.c
src/qs8-igemm/gen/2x8c8-minmax-gemmlowp-neon-mull-padal.c
+ src/qs8-igemm/gen/2x8c8-minmax-rndnu-neon-mlal-padal.c
src/qs8-igemm/gen/2x8c16-minmax-gemmlowp-neon-mlal-padal.c
src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane-prfm.c
src/qs8-igemm/gen/2x16-minmax-gemmlowp-neon-mlal-lane.c
@@ -1829,12 +1837,16 @@
src/qc8-igemm/gen/8x16c4-minmax-fp32-neondot.c
src/qs8-gemm/gen/1x8c4-minmax-fp32-neondot.c
src/qs8-gemm/gen/1x8c4-minmax-gemmlowp-neondot.c
+ src/qs8-gemm/gen/1x8c4-minmax-rndnu-neondot.c
src/qs8-gemm/gen/1x16c4-minmax-fp32-neondot.c
src/qs8-gemm/gen/1x16c4-minmax-gemmlowp-neondot.c
+ src/qs8-gemm/gen/1x16c4-minmax-rndnu-neondot.c
src/qs8-gemm/gen/4x8c4-minmax-fp32-neondot.c
src/qs8-gemm/gen/4x8c4-minmax-gemmlowp-neondot.c
+ src/qs8-gemm/gen/4x8c4-minmax-rndnu-neondot.c
src/qs8-gemm/gen/4x16c4-minmax-fp32-neondot.c
src/qs8-gemm/gen/4x16c4-minmax-gemmlowp-neondot.c
+ src/qs8-gemm/gen/4x16c4-minmax-rndnu-neondot.c
src/qs8-gemm/gen/6x8c4-minmax-fp32-neondot.c
src/qs8-gemm/gen/6x8c4-minmax-gemmlowp-neondot.c
src/qs8-gemm/gen/6x16c4-minmax-fp32-neondot.c
@@ -1845,12 +1857,16 @@
src/qs8-gemm/gen/8x16c4-minmax-gemmlowp-neondot.c
src/qs8-igemm/gen/1x8c4-minmax-fp32-neondot.c
src/qs8-igemm/gen/1x8c4-minmax-gemmlowp-neondot.c
+ src/qs8-igemm/gen/1x8c4-minmax-rndnu-neondot.c
src/qs8-igemm/gen/1x16c4-minmax-fp32-neondot.c
src/qs8-igemm/gen/1x16c4-minmax-gemmlowp-neondot.c
+ src/qs8-igemm/gen/1x16c4-minmax-rndnu-neondot.c
src/qs8-igemm/gen/4x8c4-minmax-fp32-neondot.c
src/qs8-igemm/gen/4x8c4-minmax-gemmlowp-neondot.c
+ src/qs8-igemm/gen/4x8c4-minmax-rndnu-neondot.c
src/qs8-igemm/gen/4x16c4-minmax-fp32-neondot.c
src/qs8-igemm/gen/4x16c4-minmax-gemmlowp-neondot.c
+ src/qs8-igemm/gen/4x16c4-minmax-rndnu-neondot.c
src/qs8-igemm/gen/6x8c4-minmax-fp32-neondot.c
src/qs8-igemm/gen/6x8c4-minmax-gemmlowp-neondot.c
src/qs8-igemm/gen/6x16c4-minmax-fp32-neondot.c
@@ -5422,6 +5438,15 @@
TARGET_LINK_LIBRARIES(qs8-gavgpool-minmax-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
ADD_TEST(qs8-gavgpool-test qs8-gavgpool-minmax-test)
+ ADD_EXECUTABLE(qs8-gemm-minmax-fp32-test test/qs8-gemm-minmax-fp32.cc)
+ SET_TARGET_PROPERTIES(qs8-gemm-minmax-fp32-test PROPERTIES
+ CXX_STANDARD 11
+ CXX_STANDARD_REQUIRED YES
+ CXX_EXTENSIONS YES)
+ TARGET_INCLUDE_DIRECTORIES(qs8-gemm-minmax-fp32-test PRIVATE src test)
+ TARGET_LINK_LIBRARIES(qs8-gemm-minmax-fp32-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
+ ADD_TEST(qs8-gemm-minmax-fp32-test qs8-gemm-minmax-fp32-test)
+
ADD_EXECUTABLE(qs8-gemm-minmax-gemmlowp-test test/qs8-gemm-minmax-gemmlowp.cc)
SET_TARGET_PROPERTIES(qs8-gemm-minmax-gemmlowp-test PROPERTIES
CXX_STANDARD 11
@@ -5431,14 +5456,23 @@
TARGET_LINK_LIBRARIES(qs8-gemm-minmax-gemmlowp-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
ADD_TEST(qs8-gemm-minmax-gemmlowp-test qs8-gemm-minmax-gemmlowp-test)
- ADD_EXECUTABLE(qs8-gemm-minmax-fp32-test test/qs8-gemm-minmax-fp32.cc)
- SET_TARGET_PROPERTIES(qs8-gemm-minmax-fp32-test PROPERTIES
+ ADD_EXECUTABLE(qs8-gemm-minmax-rndnu-test test/qs8-gemm-minmax-rndnu.cc)
+ SET_TARGET_PROPERTIES(qs8-gemm-minmax-rndnu-test PROPERTIES
CXX_STANDARD 11
CXX_STANDARD_REQUIRED YES
CXX_EXTENSIONS YES)
- TARGET_INCLUDE_DIRECTORIES(qs8-gemm-minmax-fp32-test PRIVATE src test)
- TARGET_LINK_LIBRARIES(qs8-gemm-minmax-fp32-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
- ADD_TEST(qs8-gemm-minmax-fp32-test qs8-gemm-minmax-fp32-test)
+ TARGET_INCLUDE_DIRECTORIES(qs8-gemm-minmax-rndnu-test PRIVATE src test)
+ TARGET_LINK_LIBRARIES(qs8-gemm-minmax-rndnu-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
+ ADD_TEST(qs8-gemm-minmax-rndnu-test qs8-gemm-minmax-rndnu-test)
+
+ ADD_EXECUTABLE(qs8-igemm-minmax-fp32-test test/qs8-igemm-minmax-fp32.cc)
+ SET_TARGET_PROPERTIES(qs8-igemm-minmax-fp32-test PROPERTIES
+ CXX_STANDARD 11
+ CXX_STANDARD_REQUIRED YES
+ CXX_EXTENSIONS YES)
+ TARGET_INCLUDE_DIRECTORIES(qs8-igemm-minmax-fp32-test PRIVATE src test)
+ TARGET_LINK_LIBRARIES(qs8-igemm-minmax-fp32-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
+ ADD_TEST(qs8-igemm-minmax-fp32-test qs8-igemm-minmax-fp32-test)
ADD_EXECUTABLE(qs8-igemm-minmax-gemmlowp-test test/qs8-igemm-minmax-gemmlowp.cc)
SET_TARGET_PROPERTIES(qs8-igemm-minmax-gemmlowp-test PROPERTIES
@@ -5449,14 +5483,14 @@
TARGET_LINK_LIBRARIES(qs8-igemm-minmax-gemmlowp-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
ADD_TEST(qs8-igemm-minmax-gemmlowp-test qs8-igemm-minmax-gemmlowp-test)
- ADD_EXECUTABLE(qs8-igemm-minmax-fp32-test test/qs8-igemm-minmax-fp32.cc)
- SET_TARGET_PROPERTIES(qs8-igemm-minmax-fp32-test PROPERTIES
+ ADD_EXECUTABLE(qs8-igemm-minmax-rndnu-test test/qs8-igemm-minmax-rndnu.cc)
+ SET_TARGET_PROPERTIES(qs8-igemm-minmax-rndnu-test PROPERTIES
CXX_STANDARD 11
CXX_STANDARD_REQUIRED YES
CXX_EXTENSIONS YES)
- TARGET_INCLUDE_DIRECTORIES(qs8-igemm-minmax-fp32-test PRIVATE src test)
- TARGET_LINK_LIBRARIES(qs8-igemm-minmax-fp32-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
- ADD_TEST(qs8-igemm-minmax-fp32-test qs8-igemm-minmax-fp32-test)
+ TARGET_INCLUDE_DIRECTORIES(qs8-igemm-minmax-rndnu-test PRIVATE src test)
+ TARGET_LINK_LIBRARIES(qs8-igemm-minmax-rndnu-test PRIVATE XNNPACK cpuinfo fp16 gtest gtest_main)
+ ADD_TEST(qs8-igemm-minmax-rndnu-test qs8-igemm-minmax-rndnu-test)
ADD_EXECUTABLE(qs8-vadd-minmax-test test/qs8-vadd-minmax.cc)
SET_TARGET_PROPERTIES(qs8-vadd-minmax-test PROPERTIES