Additional Sigmoid micro-kernels and accuracy evaluation stubs
- PSIMD micro-kernels and accuracy evaluation stubs
- ARM NEON micro-kernels using 2048-entry table lookups
- ARM NEON micro-kernels with alternative division implementations
- ARM NEON micro-kernels without FMA
- x4..x24 versions of all SIMD micro-kernels
- Eliminated the comparison with one_cutoff and the corresponding blend in all
micro-kernels
PiperOrigin-RevId: 287804583
diff --git a/BUILD.bazel b/BUILD.bazel
index 4a136c8..fa4e19e 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -316,7 +316,7 @@
"src/f32-vmulcaddc/gen/c4-wasm-2x.c",
]
-PSIMD_UKERNELS = [
+PSIMD_FASTMATH_UKERNELS = [
"src/f32-argmaxpool/4x-psimd-c4.c",
"src/f32-argmaxpool/9p8x-psimd-c4.c",
"src/f32-argmaxpool/9x-psimd-c4.c",
@@ -414,6 +414,16 @@
"src/x32-zip/xm-psimd.c",
]
+PSIMD_ACCMATH_UKERNELS = [
+ "src/f32-sigmoid/gen/psimd-p5-div-x4.c",
+ "src/f32-sigmoid/gen/psimd-p5-div-x8.c",
+ "src/f32-sigmoid/gen/psimd-p5-div-x12.c",
+ "src/f32-sigmoid/gen/psimd-p5-div-x16.c",
+ "src/f32-sigmoid/gen/psimd-p5-div-x20.c",
+ "src/f32-sigmoid/gen/psimd-p5-div-x24.c",
+ "src/math/sigmoid-psimd-p5-div.c",
+]
+
# ISA-specific micro-kernels
NEON_UKERNELS = [
"src/f32-avgpool/mp9p8q-neon.c",
@@ -485,6 +495,18 @@
"src/f32-prelu/gen/neon-2x8.c",
"src/f32-rmax/neon.c",
"src/f32-sigmoid/gen/neon-frac-p9-p10-nr1recps-x16.c",
+ "src/f32-sigmoid/gen/neon-p5-nr2recps-x4.c",
+ "src/f32-sigmoid/gen/neon-p5-nr2recps-x8.c",
+ "src/f32-sigmoid/gen/neon-p5-nr2recps-x12.c",
+ "src/f32-sigmoid/gen/neon-p5-nr2recps-x16.c",
+ "src/f32-sigmoid/gen/neon-p5-nr2recps-x20.c",
+ "src/f32-sigmoid/gen/neon-p5-nr2recps-x24.c",
+ "src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x4.c",
+ "src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x8.c",
+ "src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x12.c",
+ "src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x16.c",
+ "src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x20.c",
+ "src/f32-sigmoid/gen/neon-lut2048-p1-nr2recps-x24.c",
"src/f32-vbinary/gen/vadd-neon-x4.c",
"src/f32-vbinary/gen/vadd-neon-x8.c",
"src/f32-vbinary/gen/vaddc-neon-x4.c",
@@ -532,6 +554,8 @@
"src/x8-zip/x3-neon.c",
"src/x8-zip/x4-neon.c",
"src/x8-zip/xm-neon.c",
+ "src/math/sigmoid-neon-lut2048-p1-nr2recps.c",
+ "src/math/sigmoid-neon-p5-nr2recps.c",
]
NEONFMA_UKERNELS = [
@@ -572,7 +596,42 @@
"src/f32-hswish/gen/neonfma-x8.c",
"src/f32-ppmm/gen/4x8-neonfma.c",
"src/f32-ppmm/gen/8x8-neonfma.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr2fma-x4.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr2fma-x8.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr2fma-x12.c",
"src/f32-sigmoid/gen/neonfma-p5-nr2fma-x16.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr2fma-x20.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr2fma-x24.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x4.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x8.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x12.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x16.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x20.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr1recps1fma-x24.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr2recps-x4.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr2recps-x8.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr2recps-x12.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr2recps-x16.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr2recps-x20.c",
+ "src/f32-sigmoid/gen/neonfma-p5-nr2recps-x24.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x4.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x8.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x12.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x16.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x20.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2fma-x24.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x4.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x8.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x12.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x16.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x20.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr1recps1fma-x24.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x4.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x8.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x12.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x16.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x20.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-nr2recps-x24.c",
"src/f32-vmulcaddc/gen/c4-neonfma-2x.c",
"src/f32-vmulcaddc/gen/c8-neonfma-2x.c",
"src/math/exp-neonfma-lut64-p2.c",
@@ -622,6 +681,18 @@
"src/f32-dwconv-spchw/5x5p2-neonfma.c",
"src/f32-dwconv-spchw/3x3s2p1-neonfma.c",
"src/f32-dwconv-spchw/5x5s2p2-neonfma.c",
+ "src/f32-sigmoid/gen/neonfma-p5-div-x4.c",
+ "src/f32-sigmoid/gen/neonfma-p5-div-x8.c",
+ "src/f32-sigmoid/gen/neonfma-p5-div-x12.c",
+ "src/f32-sigmoid/gen/neonfma-p5-div-x16.c",
+ "src/f32-sigmoid/gen/neonfma-p5-div-x20.c",
+ "src/f32-sigmoid/gen/neonfma-p5-div-x24.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x4.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x8.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x12.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x16.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x20.c",
+ "src/f32-sigmoid/gen/neonfma-lut2048-p1-div-x24.c",
"src/f32-spmm/gen/12x1-neonfma.c",
"src/f32-spmm/gen/12x2-neonfma.c",
"src/f32-spmm/gen/12x4-neonfma.c",
@@ -740,8 +811,12 @@
"src/f32-argmaxpool/9x-sse2-c4.c",
"src/f32-prelu/gen/sse2-2x4.c",
"src/f32-prelu/gen/sse2-2x8.c",
+ "src/f32-sigmoid/gen/sse2-p5-div-x4.c",
"src/f32-sigmoid/gen/sse2-p5-div-x8.c",
+ "src/f32-sigmoid/gen/sse2-p5-div-x12.c",
"src/f32-sigmoid/gen/sse2-p5-div-x16.c",
+ "src/f32-sigmoid/gen/sse2-p5-div-x20.c",
+ "src/f32-sigmoid/gen/sse2-p5-div-x24.c",
"src/q8-avgpool/mp9p8q-sse2.c",
"src/q8-avgpool/up9-sse2.c",
"src/q8-igemm/4x4c2-sse2.c",
@@ -771,8 +846,12 @@
SSE41_UKERNELS = [
"src/f32-prelu/gen/sse41-2x4.c",
"src/f32-prelu/gen/sse41-2x8.c",
+ "src/f32-sigmoid/gen/sse41-p5-div-x4.c",
"src/f32-sigmoid/gen/sse41-p5-div-x8.c",
+ "src/f32-sigmoid/gen/sse41-p5-div-x12.c",
"src/f32-sigmoid/gen/sse41-p5-div-x16.c",
+ "src/f32-sigmoid/gen/sse41-p5-div-x20.c",
+ "src/f32-sigmoid/gen/sse41-p5-div-x24.c",
]
AVX_UKERNELS = [
@@ -1300,8 +1379,8 @@
)
xnnpack_cc_library(
- name = "psimd_ukernels",
- srcs = PSIMD_UKERNELS,
+ name = "psimd_fastmath_ukernels",
+ srcs = PSIMD_FASTMATH_UKERNELS,
hdrs = INTERNAL_HDRS,
aarch32_copts = [
"-marm",
@@ -1321,6 +1400,26 @@
)
xnnpack_cc_library(
+ name = "psimd_accmath_ukernels",
+ srcs = PSIMD_ACCMATH_UKERNELS,
+ hdrs = INTERNAL_HDRS,
+ aarch32_copts = [
+ "-marm",
+ "-mfpu=neon",
+ ],
+ copts = xnnpack_std_copts(),
+ optimized_copts = [
+ "-O3",
+ ],
+ deps = [
+ ":tables",
+ "@FP16",
+ "@psimd",
+ "@pthreadpool",
+ ],
+)
+
+xnnpack_cc_library(
name = "neon_ukernels",
hdrs = INTERNAL_HDRS,
aarch32_copts = [
@@ -1460,13 +1559,15 @@
xnnpack_aggregate_library(
name = "ukernels",
aarch32_deps = [
- ":psimd_ukernels",
+ ":psimd_fastmath_ukernels",
+ ":psimd_accmath_ukernels",
":neon_ukernels",
":neonfma_ukernels",
":asm_ukernels",
],
aarch64_deps = [
- ":psimd_ukernels",
+ ":psimd_fastmath_ukernels",
+ ":psimd_accmath_ukernels",
":neon_ukernels",
":neonfma_ukernels",
":neonfp16arith_ukernels",
@@ -1478,10 +1579,12 @@
],
wasmsimd_deps = [
":wasm_ukernels",
- ":psimd_ukernels",
+ ":psimd_fastmath_ukernels",
+ ":psimd_accmath_ukernels",
],
x86_deps = [
- ":psimd_ukernels",
+ ":psimd_fastmath_ukernels",
+ ":psimd_accmath_ukernels",
":sse2_ukernels",
":sse41_ukernels",
":avx_ukernels",