Additional Sigmoid micro-kernels and accuracy evaluation stubs
- PSIMD micro-kernels and accuracy evaluation stubs
- ARM NEON micro-kernels using 2048-entry table lookups
- ARM NEON micro-kernels with alternative division implementations
- ARM NEON micro-kernels without FMA
- x4..x24 versions of all SIMD micro-kernels
- Eliminated the comparison with one_cutoff and the corresponding blend in all
micro-kernels
PiperOrigin-RevId: 287804583
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ffd1260..b5d1566 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -320,7 +320,7 @@
src/x8-zip/x4-scalar.c
src/x8-zip/xm-scalar.c)
-SET(XNNPACK_PSIMD_MICROKERNEL_SRCS
+SET(XNNPACK_PSIMD_FASTMATH_MICROKERNEL_SRCS
src/f32-argmaxpool/4x-psimd-c4.c
src/f32-argmaxpool/9p8x-psimd-c4.c
src/f32-argmaxpool/9x-psimd-c4.c
@@ -417,6 +417,15 @@
src/x32-zip/x4-psimd.c
src/x32-zip/xm-psimd.c)
+SET(XNNPACK_PSIMD_ACCMATH_MICROKERNEL_SRCS
+ src/f32-sigmoid/gen/psimd-p5-div-x4.c
+ src/f32-sigmoid/gen/psimd-p5-div-x8.c
+ src/f32-sigmoid/gen/psimd-p5-div-x12.c
+ src/f32-sigmoid/gen/psimd-p5-div-x16.c
+ src/f32-sigmoid/gen/psimd-p5-div-x20.c
+ src/f32-sigmoid/gen/psimd-p5-div-x24.c
+ src/math/sigmoid-psimd-p5-div.c)
+
SET(XNNPACK_NEON_MICROKERNEL_SRCS
src/f32-avgpool/mp9p8q-neon.c
src/f32-avgpool/up9-neon.c
@@ -487,6 +496,18 @@
src/f32-prelu/gen/neon-2x8.c
src/f32-rmax/neon.c
src/f32-sigmoid/gen/neon-frac-p9-p10-nr1recps-x16.c
+ src/f32-sigmoid/gen/neon-p5-nr2recps-x4.c
+ src/f32-sigmoid/gen/neon-p5-nr2recps-x8.c
+ src/f32-sigmoid/gen/neon-p5-nr2recps-x12.c
+ src/f32-sigmoid/gen/neon-p5-nr2recps-x16.c
+ src/f32-sigmoid/gen/neon-p5-nr2recps-x20.c
+ src/f32-sigmoid/gen/neon-p5-nr2recps-x24.c
+ src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x4.c
+ src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x8.c
+ src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x12.c
+ src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x16.c
+ src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x20.c
+ src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x24.c
src/f32-vbinary/gen/vadd-neon-x4.c
src/f32-vbinary/gen/vadd-neon-x8.c
src/f32-vbinary/gen/vaddc-neon-x4.c
@@ -533,7 +554,9 @@
src/x8-zip/x2-neon.c
src/x8-zip/x3-neon.c
src/x8-zip/x4-neon.c
- src/x8-zip/xm-neon.c)
+ src/x8-zip/xm-neon.c
+ src/math/sigmoid-neon-lut2048-p1-nr2recps.c
+ src/math/sigmoid-neon-p5-nr2recps.c)
SET(XNNPACK_NEONFMA_MICROKERNEL_SRCS
src/f32-bilinear/gen/neonfma-c4.c
@@ -573,7 +596,42 @@
src/f32-hswish/gen/neonfma-x8.c
src/f32-ppmm/gen/4x8-neonfma.c
src/f32-ppmm/gen/8x8-neonfma.c
+ src/f32-sigmoid/gen/neonfma-p5-nr2fma-x4.c
+ src/f32-sigmoid/gen/neonfma-p5-nr2fma-x8.c
+ src/f32-sigmoid/gen/neonfma-p5-nr2fma-x12.c
src/f32-sigmoid/gen/neonfma-p5-nr2fma-x16.c
+ src/f32-sigmoid/gen/neonfma-p5-nr2fma-x20.c
+ src/f32-sigmoid/gen/neonfma-p5-nr2fma-x24.c
+ src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x4.c
+ src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x8.c
+ src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x12.c
+ src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x16.c
+ src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x20.c
+ src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x24.c
+ src/f32-sigmoid/gen/neonfma-p5-nr2recps-x4.c
+ src/f32-sigmoid/gen/neonfma-p5-nr2recps-x8.c
+ src/f32-sigmoid/gen/neonfma-p5-nr2recps-x12.c
+ src/f32-sigmoid/gen/neonfma-p5-nr2recps-x16.c
+ src/f32-sigmoid/gen/neonfma-p5-nr2recps-x20.c
+ src/f32-sigmoid/gen/neonfma-p5-nr2recps-x24.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x4.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x8.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x12.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x16.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x20.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x24.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x4.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x8.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x12.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x16.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x20.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x24.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x4.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x8.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x12.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x16.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x20.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x24.c
src/f32-vmulcaddc/gen/c4-neonfma-2x.c
src/f32-vmulcaddc/gen/c8-neonfma-2x.c
src/math/exp-neonfma-lut64-p2.c
@@ -622,6 +680,18 @@
src/f32-dwconv-spchw/5x5p2-neonfma.c
src/f32-dwconv-spchw/3x3s2p1-neonfma.c
src/f32-dwconv-spchw/5x5s2p2-neonfma.c
+ src/f32-sigmoid/gen/neonfma-p5-div-x4.c
+ src/f32-sigmoid/gen/neonfma-p5-div-x8.c
+ src/f32-sigmoid/gen/neonfma-p5-div-x12.c
+ src/f32-sigmoid/gen/neonfma-p5-div-x16.c
+ src/f32-sigmoid/gen/neonfma-p5-div-x20.c
+ src/f32-sigmoid/gen/neonfma-p5-div-x24.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x4.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x8.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x12.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x16.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x20.c
+ src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x24.c
src/f32-spmm/gen/12x1-neonfma.c
src/f32-spmm/gen/12x2-neonfma.c
src/f32-spmm/gen/12x4-neonfma.c
@@ -737,8 +807,12 @@
src/f32-argmaxpool/9x-sse2-c4.c
src/f32-prelu/gen/sse2-2x4.c
src/f32-prelu/gen/sse2-2x8.c
+ src/f32-sigmoid/gen/sse2-p5-div-x4.c
src/f32-sigmoid/gen/sse2-p5-div-x8.c
+ src/f32-sigmoid/gen/sse2-p5-div-x12.c
src/f32-sigmoid/gen/sse2-p5-div-x16.c
+ src/f32-sigmoid/gen/sse2-p5-div-x20.c
+ src/f32-sigmoid/gen/sse2-p5-div-x24.c
src/q8-avgpool/mp9p8q-sse2.c
src/q8-avgpool/up9-sse2.c
src/q8-igemm/4x4c2-sse2.c
@@ -767,8 +841,12 @@
SET(XNNPACK_SSE41_MICROKERNEL_SRCS
src/f32-prelu/gen/sse41-2x4.c
src/f32-prelu/gen/sse41-2x8.c
+ src/f32-sigmoid/gen/sse41-p5-div-x4.c
src/f32-sigmoid/gen/sse41-p5-div-x8.c
- src/f32-sigmoid/gen/sse41-p5-div-x16.c)
+ src/f32-sigmoid/gen/sse41-p5-div-x12.c
+ src/f32-sigmoid/gen/sse41-p5-div-x16.c
+ src/f32-sigmoid/gen/sse41-p5-div-x20.c
+ src/f32-sigmoid/gen/sse41-p5-div-x24.c)
SET(XNNPACK_AVX_MICROKERNEL_SRCS
src/f32-clamp/avx.c
@@ -1162,7 +1240,8 @@
SET(XNNPACK_MICROKERNEL_SRCS ${XNNPACK_SCALAR_MICROKERNEL_SRCS})
IF(NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
- LIST(APPEND XNNPACK_MICROKERNEL_SRCS ${XNNPACK_PSIMD_MICROKERNEL_SRCS})
+ LIST(APPEND XNNPACK_MICROKERNEL_SRCS ${XNNPACK_PSIMD_FASTMATH_MICROKERNEL_SRCS})
+ LIST(APPEND XNNPACK_MICROKERNEL_SRCS ${XNNPACK_PSIMD_ACCMATH_MICROKERNEL_SRCS})
ENDIF()
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv[5-8]" OR IOS_ARCH MATCHES "^armv7")
LIST(APPEND XNNPACK_MICROKERNEL_SRCS ${XNNPACK_NEON_MICROKERNEL_SRCS})
@@ -1200,7 +1279,7 @@
C_EXTENSIONS YES)
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv[5-8]" OR IOS_ARCH MATCHES "^armv7")
SET_PROPERTY(SOURCE ${XNNPACK_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -marm ")
- SET_PROPERTY(SOURCE ${XNNPACK_PSIMD_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -mfpu=neon ")
+ SET_PROPERTY(SOURCE ${XNNPACK_PSIMD_FASTMATH_MICROKERNEL_SRCS} ${XNNPACK_PSIMD_ACCMATH_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -mfpu=neon ")
SET_PROPERTY(SOURCE ${XNNPACK_NEON_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -mfpu=neon ")
SET_PROPERTY(SOURCE ${XNNPACK_NEONFMA_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -mfpu=neon-vfpv4 ")
IF(IOS)
@@ -1231,7 +1310,8 @@
SET_PROPERTY(SOURCE ${XNNPACK_COLD_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -Os ")
ENDIF()
IF(NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
- SET_PROPERTY(SOURCE ${XNNPACK_PSIMD_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -O3 -ffast-math ")
+ SET_PROPERTY(SOURCE ${XNNPACK_PSIMD_ACCMATH_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -O3 ")
+ SET_PROPERTY(SOURCE ${XNNPACK_PSIMD_FASTMATH_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -O3 -ffast-math ")
ENDIF()
TARGET_INCLUDE_DIRECTORIES(XNNPACK PUBLIC include)