Reoptimize HSWISH microkernels
- Compute Hard Swish as (x * (1/6)) * min(max(x + 3, 0), 6) instead of x *
min(max(x * (1/6) + 0.5, 0), 1) to expose more ILP
- Use integer minimum/maximum instructions in NEON and WAsm SIMD
- Remove NEON FMA microkernels
- Remove PSIMD microkernels
- Combine x86 and ARM versions of WAsm SIMD microkernels
Performance:
- Cortex-A73: 10.1 GB/s -> 11.8 GB/s
- Cortex-A73 (WAsm SIMD): 8.4 GB/s -> 8.9 GB/s
- Skylake-X (WAsm SIMD): 24 GB/s -> 51 GB/s
PiperOrigin-RevId: 320719817
diff --git a/BUILD.bazel b/BUILD.bazel
index 8bae09e..70ecd13 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -657,12 +657,9 @@
"src/f32-igemm/gen/6x8s4-minmax-wasmsimd-x86.c",
"src/f32-igemm/gen/4x2c4-minmax-wasmsimd-arm.c",
"src/f32-igemm/gen/4x2c4-minmax-wasmsimd-x86.c",
- "src/f32-hswish/gen/wasmsimd-arm-x4.c",
- "src/f32-hswish/gen/wasmsimd-arm-x8.c",
- "src/f32-hswish/gen/wasmsimd-arm-x16.c",
- "src/f32-hswish/gen/wasmsimd-x86-x4.c",
- "src/f32-hswish/gen/wasmsimd-x86-x8.c",
- "src/f32-hswish/gen/wasmsimd-x86-x16.c",
+ "src/f32-hswish/gen/wasmsimd-x4.c",
+ "src/f32-hswish/gen/wasmsimd-x8.c",
+ "src/f32-hswish/gen/wasmsimd-x16.c",
"src/f32-maxpool/9p8x-minmax-wasmsimd-arm-c4.c",
"src/f32-maxpool/9p8x-minmax-wasmsimd-x86-c4.c",
"src/f32-prelu/gen/wasmsimd-bitselect-2x4.c",
@@ -821,8 +818,6 @@
"src/f32-gemm/gen-inc/6x8inc-minmax-psimd-loadsplat.c",
"src/f32-gemm/gen-inc/6x8inc-minmax-psimd-splat.c",
"src/f32-gemm/gen-inc/6x8s4inc-minmax-psimd.c",
- "src/f32-hswish/gen/psimd-x4.c",
- "src/f32-hswish/gen/psimd-x8.c",
"src/f32-ibilinear/gen/psimd-c4.c",
"src/f32-ibilinear/gen/psimd-c8.c",
"src/f32-igemm/gen/1x8-minmax-psimd-loadsplat.c",
@@ -1198,8 +1193,6 @@
"src/f32-gemm/gen-inc/4x8s4inc-minmax-neonfma.c",
"src/f32-gemm/gen-inc/6x8s4inc-minmax-neonfma.c",
"src/f32-gemm/gen-inc/8x8s4inc-minmax-neonfma.c",
- "src/f32-hswish/gen/neonfma-x4.c",
- "src/f32-hswish/gen/neonfma-x8.c",
"src/f32-ppmm/gen/4x8-minmax-neonfma.c",
"src/f32-ppmm/gen/8x8-minmax-neonfma.c",
"src/f32-raddstoreexpminusmax/gen/neonfma-p5-x4.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ea7463f..12dcf21 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -558,8 +558,6 @@
src/f32-gemm/gen-inc/6x8inc-minmax-psimd-loadsplat.c
src/f32-gemm/gen-inc/6x8inc-minmax-psimd-splat.c
src/f32-gemm/gen-inc/6x8s4inc-minmax-psimd.c
- src/f32-hswish/gen/psimd-x4.c
- src/f32-hswish/gen/psimd-x8.c
src/f32-ibilinear/gen/psimd-c4.c
src/f32-ibilinear/gen/psimd-c8.c
src/f32-igemm/gen/1x8-minmax-psimd-loadsplat.c
@@ -931,8 +929,6 @@
src/f32-gemm/gen-inc/4x8s4inc-minmax-neonfma.c
src/f32-gemm/gen-inc/6x8s4inc-minmax-neonfma.c
src/f32-gemm/gen-inc/8x8s4inc-minmax-neonfma.c
- src/f32-hswish/gen/neonfma-x4.c
- src/f32-hswish/gen/neonfma-x8.c
src/f32-ppmm/gen/4x8-minmax-neonfma.c
src/f32-ppmm/gen/8x8-minmax-neonfma.c
src/f32-raddstoreexpminusmax/gen/neonfma-p5-x4.c
diff --git a/bench/f32-hswish.cc b/bench/f32-hswish.cc
index feef5da..636b849 100644
--- a/bench/f32-hswish.cc
+++ b/bench/f32-hswish.cc
@@ -66,15 +66,6 @@
->RangeMultiplier(10)
->Range(1000, 1000000)
->UseRealTime();
-
- BENCHMARK_CAPTURE(f32_hswish, neonfma_x4, xnn_f32_hswish_ukernel__neonfma_x4, benchmark::utils::CheckNEONFMA)
- ->RangeMultiplier(10)
- ->Range(1000, 1000000)
- ->UseRealTime();
- BENCHMARK_CAPTURE(f32_hswish, neonfma_x8, xnn_f32_hswish_ukernel__neonfma_x8, benchmark::utils::CheckNEONFMA)
- ->RangeMultiplier(10)
- ->Range(1000, 1000000)
- ->UseRealTime();
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
@@ -115,40 +106,16 @@
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- BENCHMARK_CAPTURE(f32_hswish, psimd_x4, xnn_f32_hswish_ukernel__psimd_x4)
- ->RangeMultiplier(10)
- ->Range(1000, 1000000)
- ->UseRealTime();
- BENCHMARK_CAPTURE(f32_hswish, psimd_x8, xnn_f32_hswish_ukernel__psimd_x8)
- ->RangeMultiplier(10)
- ->Range(1000, 1000000)
- ->UseRealTime();
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
#if XNN_ARCH_WASMSIMD
- BENCHMARK_CAPTURE(f32_hswish, wasmsimd_arm_x4, xnn_f32_hswish_ukernel__wasmsimd_arm_x4)
+ BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x4, xnn_f32_hswish_ukernel__wasmsimd_x4)
->RangeMultiplier(10)
->Range(1000, 1000000)
->UseRealTime();
- BENCHMARK_CAPTURE(f32_hswish, wasmsimd_arm_x8, xnn_f32_hswish_ukernel__wasmsimd_arm_x8)
+ BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x8, xnn_f32_hswish_ukernel__wasmsimd_x8)
->RangeMultiplier(10)
->Range(1000, 1000000)
->UseRealTime();
- BENCHMARK_CAPTURE(f32_hswish, wasmsimd_arm_x16, xnn_f32_hswish_ukernel__wasmsimd_arm_x16)
- ->RangeMultiplier(10)
- ->Range(1000, 1000000)
- ->UseRealTime();
-
- BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x86_x4, xnn_f32_hswish_ukernel__wasmsimd_x86_x4)
- ->RangeMultiplier(10)
- ->Range(1000, 1000000)
- ->UseRealTime();
- BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x86_x8, xnn_f32_hswish_ukernel__wasmsimd_x86_x8)
- ->RangeMultiplier(10)
- ->Range(1000, 1000000)
- ->UseRealTime();
- BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x86_x16, xnn_f32_hswish_ukernel__wasmsimd_x86_x16)
+ BENCHMARK_CAPTURE(f32_hswish, wasmsimd_x16, xnn_f32_hswish_ukernel__wasmsimd_x16)
->RangeMultiplier(10)
->Range(1000, 1000000)
->UseRealTime();
diff --git a/scripts/generate-f32-hswish.sh b/scripts/generate-f32-hswish.sh
index 3155c0d..4c9b51b 100755
--- a/scripts/generate-f32-hswish.sh
+++ b/scripts/generate-f32-hswish.sh
@@ -16,24 +16,13 @@
tools/xngen src/f32-hswish/scalar.c.in -D BATCH_TILE=4 -D WASM=1 -o src/f32-hswish/gen/wasm-x4.c
################################## WAsm SIMD ##################################
-tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=4 -D X86=0 -o src/f32-hswish/gen/wasmsimd-arm-x4.c
-tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=8 -D X86=0 -o src/f32-hswish/gen/wasmsimd-arm-x8.c
-tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=16 -D X86=0 -o src/f32-hswish/gen/wasmsimd-arm-x16.c
-
-tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=4 -D X86=1 -o src/f32-hswish/gen/wasmsimd-x86-x4.c
-tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=8 -D X86=1 -o src/f32-hswish/gen/wasmsimd-x86-x8.c
-tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=16 -D X86=1 -o src/f32-hswish/gen/wasmsimd-x86-x16.c
+tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=4 -o src/f32-hswish/gen/wasmsimd-x4.c
+tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=8 -o src/f32-hswish/gen/wasmsimd-x8.c
+tools/xngen src/f32-hswish/wasmsimd.c.in -D BATCH_TILE=16 -o src/f32-hswish/gen/wasmsimd-x16.c
################################### ARM NEON ##################################
-tools/xngen src/f32-hswish/neon.c.in -D BATCH_TILE=4 -D FMA=0 -o src/f32-hswish/gen/neon-x4.c
-tools/xngen src/f32-hswish/neon.c.in -D BATCH_TILE=8 -D FMA=0 -o src/f32-hswish/gen/neon-x8.c
-
-tools/xngen src/f32-hswish/neon.c.in -D BATCH_TILE=4 -D FMA=1 -o src/f32-hswish/gen/neonfma-x4.c
-tools/xngen src/f32-hswish/neon.c.in -D BATCH_TILE=8 -D FMA=1 -o src/f32-hswish/gen/neonfma-x8.c
-
-#################################### PSIMD ####################################
-tools/xngen src/f32-hswish/psimd.c.in -D BATCH_TILE=4 -o src/f32-hswish/gen/psimd-x4.c
-tools/xngen src/f32-hswish/psimd.c.in -D BATCH_TILE=8 -o src/f32-hswish/gen/psimd-x8.c
+tools/xngen src/f32-hswish/neon.c.in -D BATCH_TILE=4 -o src/f32-hswish/gen/neon-x4.c
+tools/xngen src/f32-hswish/neon.c.in -D BATCH_TILE=8 -o src/f32-hswish/gen/neon-x8.c
################################# x86 128-bit #################################
tools/xngen src/f32-hswish/sse.c.in -D BATCH_TILE=4 -o src/f32-hswish/gen/sse-x4.c
diff --git a/src/f32-hswish/gen/neon-x4.c b/src/f32-hswish/gen/neon-x4.c
index 5f9a1fe..818308d 100644
--- a/src/f32-hswish/gen/neon-x4.c
+++ b/src/f32-hswish/gen/neon-x4.c
@@ -25,37 +25,34 @@
assert(n % sizeof(float) == 0);
const float32x4_t vsixth = vld1q_dup_f32(¶ms->scalar.sixth);
- const float32x4_t vhalf = vld1q_dup_f32(¶ms->scalar.half);
- const float32x4_t vone = vld1q_dup_f32(¶ms->scalar.one);
- const float32x4_t vzero = vdupq_n_f32(0.0f);
+ const float32x4_t vthree = vld1q_dup_f32(¶ms->scalar.three);
+ const int32x4_t vsix = vreinterpretq_s32_f32(vld1q_dup_f32(¶ms->scalar.six));
+ const int32x4_t vzero = vdupq_n_s32(0);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const float32x4_t vx0123 = vld1q_f32(x); x += 4;
-
- float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
-
- vacc0123 = vmaxq_f32(vacc0123, vzero);
-
- vacc0123 = vminq_f32(vacc0123, vone);
-
- vacc0123 = vmulq_f32(vacc0123, vx0123);
-
- vst1q_f32(y, vacc0123); y += 4;
+ float32x4_t vx = vld1q_f32(x); x += 4;
+ float32x4_t vacc = vaddq_f32(vx, vthree);
+ vx = vmulq_f32(vx, vsixth);
+ vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero));
+ vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix));
+ vacc = vmulq_f32(vacc, vx);
+ vst1q_f32(y, vacc); y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const float32x4_t vx0123 = vld1q_f32(x);
- float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
- vacc0123 = vmaxq_f32(vacc0123, vzero);
- vacc0123 = vminq_f32(vacc0123, vone);
- vacc0123 = vmulq_f32(vacc0123, vx0123);
+ float32x4_t vx = vld1q_f32(x);
+ float32x4_t vacc = vaddq_f32(vx, vthree);
+ vx = vmulq_f32(vx, vsixth);
+ vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero));
+ vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix));
+ vacc = vmulq_f32(vacc, vx);
- float32x2_t vacc01 = vget_low_f32(vacc0123);
+ float32x2_t vacc_lo = vget_low_f32(vacc);
if (n & (2 * sizeof(float))) {
- vst1_f32(y, vacc01); y += 2;
- vacc01 = vget_high_f32(vacc0123);
+ vst1_f32(y, vacc_lo); y += 2;
+ vacc_lo = vget_high_f32(vacc);
}
if (n & (1 * sizeof(float))) {
- vst1_lane_f32(y, vacc01, 0);
+ vst1_lane_f32(y, vacc_lo, 0);
}
}
}
diff --git a/src/f32-hswish/gen/neon-x8.c b/src/f32-hswish/gen/neon-x8.c
index 825808c..f949f8b 100644
--- a/src/f32-hswish/gen/neon-x8.c
+++ b/src/f32-hswish/gen/neon-x8.c
@@ -25,22 +25,24 @@
assert(n % sizeof(float) == 0);
const float32x4_t vsixth = vld1q_dup_f32(¶ms->scalar.sixth);
- const float32x4_t vhalf = vld1q_dup_f32(¶ms->scalar.half);
- const float32x4_t vone = vld1q_dup_f32(¶ms->scalar.one);
- const float32x4_t vzero = vdupq_n_f32(0.0f);
+ const float32x4_t vthree = vld1q_dup_f32(¶ms->scalar.three);
+ const int32x4_t vsix = vreinterpretq_s32_f32(vld1q_dup_f32(¶ms->scalar.six));
+ const int32x4_t vzero = vdupq_n_s32(0);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
- const float32x4_t vx0123 = vld1q_f32(x); x += 4;
- const float32x4_t vx4567 = vld1q_f32(x); x += 4;
+ float32x4_t vx0123 = vld1q_f32(x); x += 4;
+ float32x4_t vx4567 = vld1q_f32(x); x += 4;
- float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
- float32x4_t vacc4567 = vmlaq_f32(vhalf, vx4567, vsixth);
+ float32x4_t vacc0123 = vaddq_f32(vx0123, vthree);
+ vx0123 = vmulq_f32(vx0123, vsixth);
+ float32x4_t vacc4567 = vaddq_f32(vx4567, vthree);
+ vx4567 = vmulq_f32(vx4567, vsixth);
- vacc0123 = vmaxq_f32(vacc0123, vzero);
- vacc4567 = vmaxq_f32(vacc4567, vzero);
+ vacc0123 = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc0123), vzero));
+ vacc4567 = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc4567), vzero));
- vacc0123 = vminq_f32(vacc0123, vone);
- vacc4567 = vminq_f32(vacc4567, vone);
+ vacc0123 = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc0123), vsix));
+ vacc4567 = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc4567), vsix));
vacc0123 = vmulq_f32(vacc0123, vx0123);
vacc4567 = vmulq_f32(vacc4567, vx4567);
@@ -49,27 +51,29 @@
vst1q_f32(y, vacc4567); y += 4;
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const float32x4_t vx0123 = vld1q_f32(x); x += 4;
- float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
- vacc0123 = vmaxq_f32(vacc0123, vzero);
- vacc0123 = vminq_f32(vacc0123, vone);
- vacc0123 = vmulq_f32(vacc0123, vx0123);
- vst1q_f32(y, vacc0123); y += 4;
+ float32x4_t vx = vld1q_f32(x); x += 4;
+ float32x4_t vacc = vaddq_f32(vx, vthree);
+ vx = vmulq_f32(vx, vsixth);
+ vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero));
+ vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix));
+ vacc = vmulq_f32(vacc, vx);
+ vst1q_f32(y, vacc); y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const float32x4_t vx0123 = vld1q_f32(x);
- float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
- vacc0123 = vmaxq_f32(vacc0123, vzero);
- vacc0123 = vminq_f32(vacc0123, vone);
- vacc0123 = vmulq_f32(vacc0123, vx0123);
+ float32x4_t vx = vld1q_f32(x);
+ float32x4_t vacc = vaddq_f32(vx, vthree);
+ vx = vmulq_f32(vx, vsixth);
+ vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero));
+ vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix));
+ vacc = vmulq_f32(vacc, vx);
- float32x2_t vacc01 = vget_low_f32(vacc0123);
+ float32x2_t vacc_lo = vget_low_f32(vacc);
if (n & (2 * sizeof(float))) {
- vst1_f32(y, vacc01); y += 2;
- vacc01 = vget_high_f32(vacc0123);
+ vst1_f32(y, vacc_lo); y += 2;
+ vacc_lo = vget_high_f32(vacc);
}
if (n & (1 * sizeof(float))) {
- vst1_lane_f32(y, vacc01, 0);
+ vst1_lane_f32(y, vacc_lo, 0);
}
}
}
diff --git a/src/f32-hswish/gen/neonfma-x4.c b/src/f32-hswish/gen/neonfma-x4.c
deleted file mode 100644
index b31510e..0000000
--- a/src/f32-hswish/gen/neonfma-x4.c
+++ /dev/null
@@ -1,61 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-hswish/neon.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__neonfma_x4(
- size_t n,
- const float* x,
- float* y,
- const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
- assert(n != 0);
- assert(n % sizeof(float) == 0);
-
- const float32x4_t vsixth = vld1q_dup_f32(¶ms->scalar.sixth);
- const float32x4_t vhalf = vld1q_dup_f32(¶ms->scalar.half);
- const float32x4_t vone = vld1q_dup_f32(¶ms->scalar.one);
- const float32x4_t vzero = vdupq_n_f32(0.0f);
-
- for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const float32x4_t vx0123 = vld1q_f32(x); x += 4;
-
- float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
-
- vacc0123 = vmaxq_f32(vacc0123, vzero);
-
- vacc0123 = vminq_f32(vacc0123, vone);
-
- vacc0123 = vmulq_f32(vacc0123, vx0123);
-
- vst1q_f32(y, vacc0123); y += 4;
- }
- if XNN_UNLIKELY(n != 0) {
- const float32x4_t vx0123 = vld1q_f32(x);
- float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
- vacc0123 = vmaxq_f32(vacc0123, vzero);
- vacc0123 = vminq_f32(vacc0123, vone);
- vacc0123 = vmulq_f32(vacc0123, vx0123);
-
- float32x2_t vacc01 = vget_low_f32(vacc0123);
- if (n & (2 * sizeof(float))) {
- vst1_f32(y, vacc01); y += 2;
- vacc01 = vget_high_f32(vacc0123);
- }
- if (n & (1 * sizeof(float))) {
- vst1_lane_f32(y, vacc01, 0);
- }
- }
-}
diff --git a/src/f32-hswish/gen/neonfma-x8.c b/src/f32-hswish/gen/neonfma-x8.c
deleted file mode 100644
index 6371884..0000000
--- a/src/f32-hswish/gen/neonfma-x8.c
+++ /dev/null
@@ -1,75 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-hswish/neon.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__neonfma_x8(
- size_t n,
- const float* x,
- float* y,
- const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
- assert(n != 0);
- assert(n % sizeof(float) == 0);
-
- const float32x4_t vsixth = vld1q_dup_f32(¶ms->scalar.sixth);
- const float32x4_t vhalf = vld1q_dup_f32(¶ms->scalar.half);
- const float32x4_t vone = vld1q_dup_f32(¶ms->scalar.one);
- const float32x4_t vzero = vdupq_n_f32(0.0f);
-
- for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
- const float32x4_t vx0123 = vld1q_f32(x); x += 4;
- const float32x4_t vx4567 = vld1q_f32(x); x += 4;
-
- float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
- float32x4_t vacc4567 = vfmaq_f32(vhalf, vx4567, vsixth);
-
- vacc0123 = vmaxq_f32(vacc0123, vzero);
- vacc4567 = vmaxq_f32(vacc4567, vzero);
-
- vacc0123 = vminq_f32(vacc0123, vone);
- vacc4567 = vminq_f32(vacc4567, vone);
-
- vacc0123 = vmulq_f32(vacc0123, vx0123);
- vacc4567 = vmulq_f32(vacc4567, vx4567);
-
- vst1q_f32(y, vacc0123); y += 4;
- vst1q_f32(y, vacc4567); y += 4;
- }
- for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const float32x4_t vx0123 = vld1q_f32(x); x += 4;
- float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
- vacc0123 = vmaxq_f32(vacc0123, vzero);
- vacc0123 = vminq_f32(vacc0123, vone);
- vacc0123 = vmulq_f32(vacc0123, vx0123);
- vst1q_f32(y, vacc0123); y += 4;
- }
- if XNN_UNLIKELY(n != 0) {
- const float32x4_t vx0123 = vld1q_f32(x);
- float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
- vacc0123 = vmaxq_f32(vacc0123, vzero);
- vacc0123 = vminq_f32(vacc0123, vone);
- vacc0123 = vmulq_f32(vacc0123, vx0123);
-
- float32x2_t vacc01 = vget_low_f32(vacc0123);
- if (n & (2 * sizeof(float))) {
- vst1_f32(y, vacc01); y += 2;
- vacc01 = vget_high_f32(vacc0123);
- }
- if (n & (1 * sizeof(float))) {
- vst1_lane_f32(y, vacc01, 0);
- }
- }
-}
diff --git a/src/f32-hswish/gen/psimd-x4.c b/src/f32-hswish/gen/psimd-x4.c
deleted file mode 100644
index 302a9dc..0000000
--- a/src/f32-hswish/gen/psimd-x4.c
+++ /dev/null
@@ -1,63 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-hswish/psimd.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__psimd_x4(
- size_t n,
- const float* x,
- float* y,
- const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
- assert(n != 0);
- assert(n % sizeof(float) == 0);
-
- const psimd_f32 vsixth = psimd_load_splat_f32(¶ms->scalar.sixth);
- const psimd_f32 vhalf = psimd_load_splat_f32(¶ms->scalar.half);
- const psimd_f32 vone = psimd_load_splat_f32(¶ms->scalar.one);
- const psimd_f32 vzero = psimd_splat_f32(0.0f);
-
- for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const psimd_f32 vx0123 = psimd_load_f32(x);
- x += 4;
-
- psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
-
- vacc0123 = psimd_max_f32(vacc0123, vzero);
-
- vacc0123 = psimd_min_f32(vacc0123, vone);
-
- vacc0123 = psimd_mul_f32(vacc0123, vx0123);
-
- psimd_store_f32(y, vacc0123);
- y += 4;
- }
- if XNN_UNLIKELY(n != 0) {
- const psimd_f32 vx0123 = psimd_load_f32(x);
- psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
- vacc0123 = psimd_max_f32(vacc0123, vzero);
- vacc0123 = psimd_min_f32(vacc0123, vone);
- vacc0123 = psimd_mul_f32(vacc0123, vx0123);
-
- if (n & (2 * sizeof(float))) {
- psimd_store2_f32(y, vacc0123);
- vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
- y += 2;
- }
- if (n & (1 * sizeof(float))) {
- psimd_store1_f32(y, vacc0123);
- }
- }
-}
diff --git a/src/f32-hswish/gen/psimd-x8.c b/src/f32-hswish/gen/psimd-x8.c
deleted file mode 100644
index bb0135d..0000000
--- a/src/f32-hswish/gen/psimd-x8.c
+++ /dev/null
@@ -1,79 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-hswish/psimd.c.in
-// Generator: tools/xngen
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__psimd_x8(
- size_t n,
- const float* x,
- float* y,
- const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
- assert(n != 0);
- assert(n % sizeof(float) == 0);
-
- const psimd_f32 vsixth = psimd_load_splat_f32(¶ms->scalar.sixth);
- const psimd_f32 vhalf = psimd_load_splat_f32(¶ms->scalar.half);
- const psimd_f32 vone = psimd_load_splat_f32(¶ms->scalar.one);
- const psimd_f32 vzero = psimd_splat_f32(0.0f);
-
- for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
- const psimd_f32 vx0123 = psimd_load_f32(x);
- const psimd_f32 vx4567 = psimd_load_f32(x + 4);
- x += 8;
-
- psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
- psimd_f32 vacc4567 = psimd_qfma_f32(vhalf, vx4567, vsixth);
-
- vacc0123 = psimd_max_f32(vacc0123, vzero);
- vacc4567 = psimd_max_f32(vacc4567, vzero);
-
- vacc0123 = psimd_min_f32(vacc0123, vone);
- vacc4567 = psimd_min_f32(vacc4567, vone);
-
- vacc0123 = psimd_mul_f32(vacc0123, vx0123);
- vacc4567 = psimd_mul_f32(vacc4567, vx4567);
-
- psimd_store_f32(y, vacc0123);
- psimd_store_f32(y + 4, vacc4567);
- y += 8;
- }
- for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const psimd_f32 vx0123 = psimd_load_f32(x);
- x += 4;
- psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
- vacc0123 = psimd_max_f32(vacc0123, vzero);
- vacc0123 = psimd_min_f32(vacc0123, vone);
- vacc0123 = psimd_mul_f32(vacc0123, vx0123);
- psimd_store_f32(y, vacc0123);
- y += 4;
- }
- if XNN_UNLIKELY(n != 0) {
- const psimd_f32 vx0123 = psimd_load_f32(x);
- psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
- vacc0123 = psimd_max_f32(vacc0123, vzero);
- vacc0123 = psimd_min_f32(vacc0123, vone);
- vacc0123 = psimd_mul_f32(vacc0123, vx0123);
-
- if (n & (2 * sizeof(float))) {
- psimd_store2_f32(y, vacc0123);
- vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
- y += 2;
- }
- if (n & (1 * sizeof(float))) {
- psimd_store1_f32(y, vacc0123);
- }
- }
-}
diff --git a/src/f32-hswish/gen/scalar-x1.c b/src/f32-hswish/gen/scalar-x1.c
index fae0788..c93e22b 100644
--- a/src/f32-hswish/gen/scalar-x1.c
+++ b/src/f32-hswish/gen/scalar-x1.c
@@ -24,17 +24,19 @@
assert(n % sizeof(float) == 0);
const float vsixth = params->scalar.sixth;
- const float vhalf = params->scalar.half;
- const float vone = params->scalar.one;
- assert(vhalf == 0.5f);
- assert(vone == 1.0f);
+ const float vthree = params->scalar.three;
+ const float vsix = params->scalar.six;
+ const float vzero = 0.0f;
+ assert(vthree == 3.0f);
+ assert(vsix == 6.0f);
for (; n >= sizeof(float); n -= sizeof(float)) {
- const float vx = *x++;
- float vacc = vx * vsixth + vhalf;
- vacc = math_max_f32(vacc, 0.0f);
- vacc = math_min_f32(vacc, vone);
- vacc = vacc * vx;
+ float vx = *x++;
+ float vacc = vx + vthree;
+ vx *= vsixth;
+ vacc = math_max_f32(vacc, vzero);
+ vacc = math_min_f32(vacc, vsix);
+ vacc *= vx;
*y++ = vacc;
}
}
diff --git a/src/f32-hswish/gen/scalar-x2.c b/src/f32-hswish/gen/scalar-x2.c
index 0b879ea..d84310e 100644
--- a/src/f32-hswish/gen/scalar-x2.c
+++ b/src/f32-hswish/gen/scalar-x2.c
@@ -24,24 +24,27 @@
assert(n % sizeof(float) == 0);
const float vsixth = params->scalar.sixth;
- const float vhalf = params->scalar.half;
- const float vone = params->scalar.one;
- assert(vhalf == 0.5f);
- assert(vone == 1.0f);
+ const float vthree = params->scalar.three;
+ const float vsix = params->scalar.six;
+ const float vzero = 0.0f;
+ assert(vthree == 3.0f);
+ assert(vsix == 6.0f);
for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
- const float vx0 = x[0];
- const float vx1 = x[1];
+ float vx0 = x[0];
+ float vx1 = x[1];
x += 2;
- float vacc0 = vx0 * vsixth + vhalf;
- float vacc1 = vx1 * vsixth + vhalf;
+ float vacc0 = vx0 + vthree;
+ vx0 *= vsixth;
+ float vacc1 = vx1 + vthree;
+ vx1 *= vsixth;
- vacc0 = math_max_f32(vacc0, 0.0f);
- vacc1 = math_max_f32(vacc1, 0.0f);
+ vacc0 = math_max_f32(vacc0, vzero);
+ vacc1 = math_max_f32(vacc1, vzero);
- vacc0 = math_min_f32(vacc0, vone);
- vacc1 = math_min_f32(vacc1, vone);
+ vacc0 = math_min_f32(vacc0, vsix);
+ vacc1 = math_min_f32(vacc1, vsix);
vacc0 *= vx0;
vacc1 *= vx1;
@@ -51,11 +54,12 @@
y += 2;
}
if XNN_UNLIKELY(n != 0) {
- const float vx = *x;
- float vacc = vx * vsixth + vhalf;
- vacc = math_max_f32(vacc, 0.0f);
- vacc = math_min_f32(vacc, vone);
- vacc = vacc * vx;
+ float vx = *x;
+ float vacc = vx + vthree;
+ vx *= vsixth;
+ vacc = math_max_f32(vacc, vzero);
+ vacc = math_min_f32(vacc, vsix);
+ vacc *= vx;
*y = vacc;
}
}
diff --git a/src/f32-hswish/gen/scalar-x4.c b/src/f32-hswish/gen/scalar-x4.c
index 23f77c0..6ef7dfc 100644
--- a/src/f32-hswish/gen/scalar-x4.c
+++ b/src/f32-hswish/gen/scalar-x4.c
@@ -24,32 +24,37 @@
assert(n % sizeof(float) == 0);
const float vsixth = params->scalar.sixth;
- const float vhalf = params->scalar.half;
- const float vone = params->scalar.one;
- assert(vhalf == 0.5f);
- assert(vone == 1.0f);
+ const float vthree = params->scalar.three;
+ const float vsix = params->scalar.six;
+ const float vzero = 0.0f;
+ assert(vthree == 3.0f);
+ assert(vsix == 6.0f);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const float vx0 = x[0];
- const float vx1 = x[1];
- const float vx2 = x[2];
- const float vx3 = x[3];
+ float vx0 = x[0];
+ float vx1 = x[1];
+ float vx2 = x[2];
+ float vx3 = x[3];
x += 4;
- float vacc0 = vx0 * vsixth + vhalf;
- float vacc1 = vx1 * vsixth + vhalf;
- float vacc2 = vx2 * vsixth + vhalf;
- float vacc3 = vx3 * vsixth + vhalf;
+ float vacc0 = vx0 + vthree;
+ vx0 *= vsixth;
+ float vacc1 = vx1 + vthree;
+ vx1 *= vsixth;
+ float vacc2 = vx2 + vthree;
+ vx2 *= vsixth;
+ float vacc3 = vx3 + vthree;
+ vx3 *= vsixth;
- vacc0 = math_max_f32(vacc0, 0.0f);
- vacc1 = math_max_f32(vacc1, 0.0f);
- vacc2 = math_max_f32(vacc2, 0.0f);
- vacc3 = math_max_f32(vacc3, 0.0f);
+ vacc0 = math_max_f32(vacc0, vzero);
+ vacc1 = math_max_f32(vacc1, vzero);
+ vacc2 = math_max_f32(vacc2, vzero);
+ vacc3 = math_max_f32(vacc3, vzero);
- vacc0 = math_min_f32(vacc0, vone);
- vacc1 = math_min_f32(vacc1, vone);
- vacc2 = math_min_f32(vacc2, vone);
- vacc3 = math_min_f32(vacc3, vone);
+ vacc0 = math_min_f32(vacc0, vsix);
+ vacc1 = math_min_f32(vacc1, vsix);
+ vacc2 = math_min_f32(vacc2, vsix);
+ vacc3 = math_min_f32(vacc3, vsix);
vacc0 *= vx0;
vacc1 *= vx1;
@@ -64,11 +69,12 @@
}
if XNN_UNLIKELY(n != 0) {
do {
- const float vx = *x++;
- float vacc = vx * vsixth + vhalf;
- vacc = math_max_f32(vacc, 0.0f);
- vacc = math_min_f32(vacc, vone);
- vacc = vacc * vx;
+ float vx = *x++;
+ float vacc = vx + vthree;
+ vx *= vsixth;
+ vacc = math_max_f32(vacc, vzero);
+ vacc = math_min_f32(vacc, vsix);
+ vacc *= vx;
*y++ = vacc;
n -= sizeof(float);
} while (n != 0);
diff --git a/src/f32-hswish/gen/wasm-x1.c b/src/f32-hswish/gen/wasm-x1.c
index 9cb44c2..e3c520c 100644
--- a/src/f32-hswish/gen/wasm-x1.c
+++ b/src/f32-hswish/gen/wasm-x1.c
@@ -24,17 +24,19 @@
assert(n % sizeof(float) == 0);
const float vsixth = params->scalar.sixth;
- const float vhalf = params->scalar.half;
- const float vone = params->scalar.one;
- assert(vhalf == 0.5f);
- assert(vone == 1.0f);
+ const float vthree = params->scalar.three;
+ const float vsix = params->scalar.six;
+ const float vzero = 0.0f;
+ assert(vthree == 3.0f);
+ assert(vsix == 6.0f);
for (; n >= sizeof(float); n -= sizeof(float)) {
- const float vx = *x++;
- float vacc = vx * vsixth + vhalf;
- vacc = __builtin_wasm_max_f32(vacc, 0.0f);
- vacc = __builtin_wasm_min_f32(vacc, vone);
- vacc = vacc * vx;
+ float vx = *x++;
+ float vacc = vx + vthree;
+ vx *= vsixth;
+ vacc = __builtin_wasm_max_f32(vacc, vzero);
+ vacc = __builtin_wasm_min_f32(vacc, vsix);
+ vacc *= vx;
*y++ = vacc;
}
}
diff --git a/src/f32-hswish/gen/wasm-x2.c b/src/f32-hswish/gen/wasm-x2.c
index 531dece..ed8bebb 100644
--- a/src/f32-hswish/gen/wasm-x2.c
+++ b/src/f32-hswish/gen/wasm-x2.c
@@ -24,24 +24,27 @@
assert(n % sizeof(float) == 0);
const float vsixth = params->scalar.sixth;
- const float vhalf = params->scalar.half;
- const float vone = params->scalar.one;
- assert(vhalf == 0.5f);
- assert(vone == 1.0f);
+ const float vthree = params->scalar.three;
+ const float vsix = params->scalar.six;
+ const float vzero = 0.0f;
+ assert(vthree == 3.0f);
+ assert(vsix == 6.0f);
for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
- const float vx0 = x[0];
- const float vx1 = x[1];
+ float vx0 = x[0];
+ float vx1 = x[1];
x += 2;
- float vacc0 = vx0 * vsixth + vhalf;
- float vacc1 = vx1 * vsixth + vhalf;
+ float vacc0 = vx0 + vthree;
+ vx0 *= vsixth;
+ float vacc1 = vx1 + vthree;
+ vx1 *= vsixth;
- vacc0 = __builtin_wasm_max_f32(vacc0, 0.0f);
- vacc1 = __builtin_wasm_max_f32(vacc1, 0.0f);
+ vacc0 = __builtin_wasm_max_f32(vacc0, vzero);
+ vacc1 = __builtin_wasm_max_f32(vacc1, vzero);
- vacc0 = __builtin_wasm_min_f32(vacc0, vone);
- vacc1 = __builtin_wasm_min_f32(vacc1, vone);
+ vacc0 = __builtin_wasm_min_f32(vacc0, vsix);
+ vacc1 = __builtin_wasm_min_f32(vacc1, vsix);
vacc0 *= vx0;
vacc1 *= vx1;
@@ -51,11 +54,12 @@
y += 2;
}
if XNN_UNLIKELY(n != 0) {
- const float vx = *x;
- float vacc = vx * vsixth + vhalf;
- vacc = __builtin_wasm_max_f32(vacc, 0.0f);
- vacc = __builtin_wasm_min_f32(vacc, vone);
- vacc = vacc * vx;
+ float vx = *x;
+ float vacc = vx + vthree;
+ vx *= vsixth;
+ vacc = __builtin_wasm_max_f32(vacc, vzero);
+ vacc = __builtin_wasm_min_f32(vacc, vsix);
+ vacc *= vx;
*y = vacc;
}
}
diff --git a/src/f32-hswish/gen/wasm-x4.c b/src/f32-hswish/gen/wasm-x4.c
index d31ea04..696d055 100644
--- a/src/f32-hswish/gen/wasm-x4.c
+++ b/src/f32-hswish/gen/wasm-x4.c
@@ -24,32 +24,37 @@
assert(n % sizeof(float) == 0);
const float vsixth = params->scalar.sixth;
- const float vhalf = params->scalar.half;
- const float vone = params->scalar.one;
- assert(vhalf == 0.5f);
- assert(vone == 1.0f);
+ const float vthree = params->scalar.three;
+ const float vsix = params->scalar.six;
+ const float vzero = 0.0f;
+ assert(vthree == 3.0f);
+ assert(vsix == 6.0f);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const float vx0 = x[0];
- const float vx1 = x[1];
- const float vx2 = x[2];
- const float vx3 = x[3];
+ float vx0 = x[0];
+ float vx1 = x[1];
+ float vx2 = x[2];
+ float vx3 = x[3];
x += 4;
- float vacc0 = vx0 * vsixth + vhalf;
- float vacc1 = vx1 * vsixth + vhalf;
- float vacc2 = vx2 * vsixth + vhalf;
- float vacc3 = vx3 * vsixth + vhalf;
+ float vacc0 = vx0 + vthree;
+ vx0 *= vsixth;
+ float vacc1 = vx1 + vthree;
+ vx1 *= vsixth;
+ float vacc2 = vx2 + vthree;
+ vx2 *= vsixth;
+ float vacc3 = vx3 + vthree;
+ vx3 *= vsixth;
- vacc0 = __builtin_wasm_max_f32(vacc0, 0.0f);
- vacc1 = __builtin_wasm_max_f32(vacc1, 0.0f);
- vacc2 = __builtin_wasm_max_f32(vacc2, 0.0f);
- vacc3 = __builtin_wasm_max_f32(vacc3, 0.0f);
+ vacc0 = __builtin_wasm_max_f32(vacc0, vzero);
+ vacc1 = __builtin_wasm_max_f32(vacc1, vzero);
+ vacc2 = __builtin_wasm_max_f32(vacc2, vzero);
+ vacc3 = __builtin_wasm_max_f32(vacc3, vzero);
- vacc0 = __builtin_wasm_min_f32(vacc0, vone);
- vacc1 = __builtin_wasm_min_f32(vacc1, vone);
- vacc2 = __builtin_wasm_min_f32(vacc2, vone);
- vacc3 = __builtin_wasm_min_f32(vacc3, vone);
+ vacc0 = __builtin_wasm_min_f32(vacc0, vsix);
+ vacc1 = __builtin_wasm_min_f32(vacc1, vsix);
+ vacc2 = __builtin_wasm_min_f32(vacc2, vsix);
+ vacc3 = __builtin_wasm_min_f32(vacc3, vsix);
vacc0 *= vx0;
vacc1 *= vx1;
@@ -64,11 +69,12 @@
}
if XNN_UNLIKELY(n != 0) {
do {
- const float vx = *x++;
- float vacc = vx * vsixth + vhalf;
- vacc = __builtin_wasm_max_f32(vacc, 0.0f);
- vacc = __builtin_wasm_min_f32(vacc, vone);
- vacc = vacc * vx;
+ float vx = *x++;
+ float vacc = vx + vthree;
+ vx *= vsixth;
+ vacc = __builtin_wasm_max_f32(vacc, vzero);
+ vacc = __builtin_wasm_min_f32(vacc, vsix);
+ vacc *= vx;
*y++ = vacc;
n -= sizeof(float);
} while (n != 0);
diff --git a/src/f32-hswish/gen/wasmsimd-arm-x16.c b/src/f32-hswish/gen/wasmsimd-arm-x16.c
deleted file mode 100644
index 1280245..0000000
--- a/src/f32-hswish/gen/wasmsimd-arm-x16.c
+++ /dev/null
@@ -1,94 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-hswish/wasmsimd.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__wasmsimd_arm_x16(
- size_t n,
- const float* x,
- float* y,
- const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
- assert(n != 0);
- assert(n % sizeof(float) == 0);
-
- const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
- const v128_t vhalf = wasm_v32x4_load_splat(&params->scalar.half);
- const v128_t vone = wasm_v32x4_load_splat(&params->scalar.one);
- const v128_t vzero = wasm_f32x4_splat(0.0f);
-
- for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
- const v128_t vx0123 = wasm_v128_load(x);
- const v128_t vx4567 = wasm_v128_load(x + 4);
- const v128_t vx89AB = wasm_v128_load(x + 8);
- const v128_t vxCDEF = wasm_v128_load(x + 12);
- x += 16;
-
- v128_t vacc0123 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx0123, vsixth));
- v128_t vacc4567 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx4567, vsixth));
- v128_t vacc89AB = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx89AB, vsixth));
- v128_t vaccCDEF = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vxCDEF, vsixth));
-
- vacc0123 = wasm_f32x4_max(vacc0123, vzero);
- vacc4567 = wasm_f32x4_max(vacc4567, vzero);
- vacc89AB = wasm_f32x4_max(vacc89AB, vzero);
- vaccCDEF = wasm_f32x4_max(vaccCDEF, vzero);
-
- vacc0123 = wasm_f32x4_min(vacc0123, vone);
- vacc4567 = wasm_f32x4_min(vacc4567, vone);
- vacc89AB = wasm_f32x4_min(vacc89AB, vone);
- vaccCDEF = wasm_f32x4_min(vaccCDEF, vone);
-
- vacc0123 = wasm_f32x4_mul(vacc0123, vx0123);
- vacc4567 = wasm_f32x4_mul(vacc4567, vx4567);
- vacc89AB = wasm_f32x4_mul(vacc89AB, vx89AB);
- vaccCDEF = wasm_f32x4_mul(vaccCDEF, vxCDEF);
-
- wasm_v128_store(y, vacc0123);
- wasm_v128_store(y + 4, vacc4567);
- wasm_v128_store(y + 8, vacc89AB);
- wasm_v128_store(y + 12, vaccCDEF);
- y += 16;
- }
- for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const v128_t vx = wasm_v128_load(x);
- x += 4;
- v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
- vacc = wasm_f32x4_max(vacc, vzero);
- vacc = wasm_f32x4_min(vacc, vone);
- vacc = wasm_f32x4_mul(vacc, vx);
-
- wasm_v128_store(y, vacc);
- y += 4;
- }
- if XNN_UNLIKELY(n != 0) {
- const v128_t vx = wasm_v128_load(x);
- v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
- vacc = wasm_f32x4_max(vacc, vzero);
- vacc = wasm_f32x4_min(vacc, vone);
- vacc = wasm_f32x4_mul(vacc, vx);
-
- if (n & (2 * sizeof(float))) {
- *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
- vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
- y += 2;
- }
- if (n & (1 * sizeof(float))) {
- *y = wasm_f32x4_extract_lane(vacc, 0);
- }
- }
-}
diff --git a/src/f32-hswish/gen/wasmsimd-arm-x8.c b/src/f32-hswish/gen/wasmsimd-arm-x8.c
deleted file mode 100644
index 7cebce8..0000000
--- a/src/f32-hswish/gen/wasmsimd-arm-x8.c
+++ /dev/null
@@ -1,82 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-hswish/wasmsimd.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__wasmsimd_arm_x8(
- size_t n,
- const float* x,
- float* y,
- const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
- assert(n != 0);
- assert(n % sizeof(float) == 0);
-
- const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
- const v128_t vhalf = wasm_v32x4_load_splat(&params->scalar.half);
- const v128_t vone = wasm_v32x4_load_splat(&params->scalar.one);
- const v128_t vzero = wasm_f32x4_splat(0.0f);
-
- for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
- const v128_t vx0123 = wasm_v128_load(x);
- const v128_t vx4567 = wasm_v128_load(x + 4);
- x += 8;
-
- v128_t vacc0123 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx0123, vsixth));
- v128_t vacc4567 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx4567, vsixth));
-
- vacc0123 = wasm_f32x4_max(vacc0123, vzero);
- vacc4567 = wasm_f32x4_max(vacc4567, vzero);
-
- vacc0123 = wasm_f32x4_min(vacc0123, vone);
- vacc4567 = wasm_f32x4_min(vacc4567, vone);
-
- vacc0123 = wasm_f32x4_mul(vacc0123, vx0123);
- vacc4567 = wasm_f32x4_mul(vacc4567, vx4567);
-
- wasm_v128_store(y, vacc0123);
- wasm_v128_store(y + 4, vacc4567);
- y += 8;
- }
- for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const v128_t vx = wasm_v128_load(x);
- x += 4;
- v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
- vacc = wasm_f32x4_max(vacc, vzero);
- vacc = wasm_f32x4_min(vacc, vone);
- vacc = wasm_f32x4_mul(vacc, vx);
-
- wasm_v128_store(y, vacc);
- y += 4;
- }
- if XNN_UNLIKELY(n != 0) {
- const v128_t vx = wasm_v128_load(x);
- v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
- vacc = wasm_f32x4_max(vacc, vzero);
- vacc = wasm_f32x4_min(vacc, vone);
- vacc = wasm_f32x4_mul(vacc, vx);
-
- if (n & (2 * sizeof(float))) {
- *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
- vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
- y += 2;
- }
- if (n & (1 * sizeof(float))) {
- *y = wasm_f32x4_extract_lane(vacc, 0);
- }
- }
-}
diff --git a/src/f32-hswish/gen/wasmsimd-x16.c b/src/f32-hswish/gen/wasmsimd-x16.c
new file mode 100644
index 0000000..dcbcd77
--- /dev/null
+++ b/src/f32-hswish/gen/wasmsimd-x16.c
@@ -0,0 +1,100 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-hswish/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__wasmsimd_x16(
+ size_t n,
+ const float* x,
+ float* y,
+ const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+
+ const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
+ const v128_t vthree = wasm_v32x4_load_splat(&params->scalar.three);
+ const v128_t vsix = wasm_v32x4_load_splat(&params->scalar.six);
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+
+ for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+ v128_t vx0123 = wasm_v128_load(x);
+ v128_t vx4567 = wasm_v128_load(x + 4);
+ v128_t vx89AB = wasm_v128_load(x + 8);
+ v128_t vxCDEF = wasm_v128_load(x + 12);
+ x += 16;
+
+ v128_t vacc0123 = wasm_f32x4_add(vx0123, vthree);
+ vx0123 = wasm_f32x4_mul(vx0123, vsixth);
+ v128_t vacc4567 = wasm_f32x4_add(vx4567, vthree);
+ vx4567 = wasm_f32x4_mul(vx4567, vsixth);
+ v128_t vacc89AB = wasm_f32x4_add(vx89AB, vthree);
+ vx89AB = wasm_f32x4_mul(vx89AB, vsixth);
+ v128_t vaccCDEF = wasm_f32x4_add(vxCDEF, vthree);
+ vxCDEF = wasm_f32x4_mul(vxCDEF, vsixth);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vzero);
+ vacc4567 = wasm_i32x4_max(vacc4567, vzero);
+ vacc89AB = wasm_i32x4_max(vacc89AB, vzero);
+ vaccCDEF = wasm_i32x4_max(vaccCDEF, vzero);
+
+ vacc0123 = wasm_i32x4_min(vacc0123, vsix);
+ vacc4567 = wasm_i32x4_min(vacc4567, vsix);
+ vacc89AB = wasm_i32x4_min(vacc89AB, vsix);
+ vaccCDEF = wasm_i32x4_min(vaccCDEF, vsix);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vx0123);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vx4567);
+ vacc89AB = wasm_f32x4_mul(vacc89AB, vx89AB);
+ vaccCDEF = wasm_f32x4_mul(vaccCDEF, vxCDEF);
+
+ wasm_v128_store(y, vacc0123);
+ wasm_v128_store(y + 4, vacc4567);
+ wasm_v128_store(y + 8, vacc89AB);
+ wasm_v128_store(y + 12, vaccCDEF);
+ y += 16;
+ }
+ for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+ v128_t vx = wasm_v128_load(x);
+ x += 4;
+
+ v128_t vacc = wasm_f32x4_add(vx, vthree);
+ vx = wasm_f32x4_mul(vx, vsixth);
+ vacc = wasm_i32x4_max(vacc, vzero);
+ vacc = wasm_i32x4_min(vacc, vsix);
+ vacc = wasm_f32x4_mul(vacc, vx);
+
+ wasm_v128_store(y, vacc);
+ y += 4;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ v128_t vx = wasm_v128_load(x);
+
+ v128_t vacc = wasm_f32x4_add(vx, vthree);
+ vx = wasm_f32x4_mul(vx, vsixth);
+ vacc = wasm_i32x4_max(vacc, vzero);
+ vacc = wasm_i32x4_min(vacc, vsix);
+ vacc = wasm_f32x4_mul(vacc, vx);
+
+ if (n & (2 * sizeof(float))) {
+ *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
+ vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
+ y += 2;
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = wasm_f32x4_extract_lane(vacc, 0);
+ }
+ }
+}
diff --git a/src/f32-hswish/gen/wasmsimd-arm-x4.c b/src/f32-hswish/gen/wasmsimd-x4.c
similarity index 67%
rename from src/f32-hswish/gen/wasmsimd-arm-x4.c
rename to src/f32-hswish/gen/wasmsimd-x4.c
index bc9ba3c..0e63426 100644
--- a/src/f32-hswish/gen/wasmsimd-arm-x4.c
+++ b/src/f32-hswish/gen/wasmsimd-x4.c
@@ -15,7 +15,7 @@
#include <xnnpack/hswish.h>
-void xnn_f32_hswish_ukernel__wasmsimd_arm_x4(
+void xnn_f32_hswish_ukernel__wasmsimd_x4(
size_t n,
const float* x,
float* y,
@@ -25,28 +25,30 @@
assert(n % sizeof(float) == 0);
const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
- const v128_t vhalf = wasm_v32x4_load_splat(&params->scalar.half);
- const v128_t vone = wasm_v32x4_load_splat(&params->scalar.one);
+ const v128_t vthree = wasm_v32x4_load_splat(&params->scalar.three);
+ const v128_t vsix = wasm_v32x4_load_splat(&params->scalar.six);
const v128_t vzero = wasm_f32x4_splat(0.0f);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const v128_t vx = wasm_v128_load(x);
+ v128_t vx = wasm_v128_load(x);
x += 4;
- v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
- vacc = wasm_f32x4_max(vacc, vzero);
- vacc = wasm_f32x4_min(vacc, vone);
+ v128_t vacc = wasm_f32x4_add(vx, vthree);
+ vx = wasm_f32x4_mul(vx, vsixth);
+ vacc = wasm_i32x4_max(vacc, vzero);
+ vacc = wasm_i32x4_min(vacc, vsix);
vacc = wasm_f32x4_mul(vacc, vx);
wasm_v128_store(y, vacc);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const v128_t vx = wasm_v128_load(x);
- v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
+ v128_t vx = wasm_v128_load(x);
- vacc = wasm_f32x4_max(vacc, vzero);
- vacc = wasm_f32x4_min(vacc, vone);
+ v128_t vacc = wasm_f32x4_add(vx, vthree);
+ vx = wasm_f32x4_mul(vx, vsixth);
+ vacc = wasm_i32x4_max(vacc, vzero);
+ vacc = wasm_i32x4_min(vacc, vsix);
vacc = wasm_f32x4_mul(vacc, vx);
if (n & (2 * sizeof(float))) {
diff --git a/src/f32-hswish/gen/wasmsimd-x8.c b/src/f32-hswish/gen/wasmsimd-x8.c
new file mode 100644
index 0000000..a078759
--- /dev/null
+++ b/src/f32-hswish/gen/wasmsimd-x8.c
@@ -0,0 +1,86 @@
+// Auto-generated file. Do not edit!
+// Template: src/f32-hswish/wasmsimd.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/hswish.h>
+
+
+void xnn_f32_hswish_ukernel__wasmsimd_x8(
+ size_t n,
+ const float* x,
+ float* y,
+ const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(n != 0);
+ assert(n % sizeof(float) == 0);
+
+ const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
+ const v128_t vthree = wasm_v32x4_load_splat(&params->scalar.three);
+ const v128_t vsix = wasm_v32x4_load_splat(&params->scalar.six);
+ const v128_t vzero = wasm_f32x4_splat(0.0f);
+
+ for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+ v128_t vx0123 = wasm_v128_load(x);
+ v128_t vx4567 = wasm_v128_load(x + 4);
+ x += 8;
+
+ v128_t vacc0123 = wasm_f32x4_add(vx0123, vthree);
+ vx0123 = wasm_f32x4_mul(vx0123, vsixth);
+ v128_t vacc4567 = wasm_f32x4_add(vx4567, vthree);
+ vx4567 = wasm_f32x4_mul(vx4567, vsixth);
+
+ vacc0123 = wasm_i32x4_max(vacc0123, vzero);
+ vacc4567 = wasm_i32x4_max(vacc4567, vzero);
+
+ vacc0123 = wasm_i32x4_min(vacc0123, vsix);
+ vacc4567 = wasm_i32x4_min(vacc4567, vsix);
+
+ vacc0123 = wasm_f32x4_mul(vacc0123, vx0123);
+ vacc4567 = wasm_f32x4_mul(vacc4567, vx4567);
+
+ wasm_v128_store(y, vacc0123);
+ wasm_v128_store(y + 4, vacc4567);
+ y += 8;
+ }
+ for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+ v128_t vx = wasm_v128_load(x);
+ x += 4;
+
+ v128_t vacc = wasm_f32x4_add(vx, vthree);
+ vx = wasm_f32x4_mul(vx, vsixth);
+ vacc = wasm_i32x4_max(vacc, vzero);
+ vacc = wasm_i32x4_min(vacc, vsix);
+ vacc = wasm_f32x4_mul(vacc, vx);
+
+ wasm_v128_store(y, vacc);
+ y += 4;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ v128_t vx = wasm_v128_load(x);
+
+ v128_t vacc = wasm_f32x4_add(vx, vthree);
+ vx = wasm_f32x4_mul(vx, vsixth);
+ vacc = wasm_i32x4_max(vacc, vzero);
+ vacc = wasm_i32x4_min(vacc, vsix);
+ vacc = wasm_f32x4_mul(vacc, vx);
+
+ if (n & (2 * sizeof(float))) {
+ *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
+ vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
+ y += 2;
+ }
+ if (n & (1 * sizeof(float))) {
+ *y = wasm_f32x4_extract_lane(vacc, 0);
+ }
+ }
+}
diff --git a/src/f32-hswish/gen/wasmsimd-x86-x16.c b/src/f32-hswish/gen/wasmsimd-x86-x16.c
deleted file mode 100644
index 441a62d..0000000
--- a/src/f32-hswish/gen/wasmsimd-x86-x16.c
+++ /dev/null
@@ -1,106 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-hswish/wasmsimd.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__wasmsimd_x86_x16(
- size_t n,
- const float* x,
- float* y,
- const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
- assert(n != 0);
- assert(n % sizeof(float) == 0);
-
- const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
- const v128_t vhalf = wasm_v32x4_load_splat(&params->scalar.half);
- const v128_t vone = wasm_v32x4_load_splat(&params->scalar.one);
- const v128_t vzero = wasm_f32x4_splat(0.0f);
-
- for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
- const v128_t vx0123 = wasm_v128_load(x);
- const v128_t vx4567 = wasm_v128_load(x + 4);
- const v128_t vx89AB = wasm_v128_load(x + 8);
- const v128_t vxCDEF = wasm_v128_load(x + 12);
- x += 16;
-
- v128_t vacc0123 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx0123, vsixth));
- v128_t vacc4567 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx4567, vsixth));
- v128_t vacc89AB = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx89AB, vsixth));
- v128_t vaccCDEF = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vxCDEF, vsixth));
-
- const v128_t vmasklt0123 = wasm_f32x4_lt(vacc0123, vzero);
- vacc0123 = wasm_v128_andnot(vacc0123, vmasklt0123);
- const v128_t vmasklt4567 = wasm_f32x4_lt(vacc4567, vzero);
- vacc4567 = wasm_v128_andnot(vacc4567, vmasklt4567);
- const v128_t vmasklt89AB = wasm_f32x4_lt(vacc89AB, vzero);
- vacc89AB = wasm_v128_andnot(vacc89AB, vmasklt89AB);
- const v128_t vmaskltCDEF = wasm_f32x4_lt(vaccCDEF, vzero);
- vaccCDEF = wasm_v128_andnot(vaccCDEF, vmaskltCDEF);
-
- const v128_t vmaskge0123 = wasm_f32x4_ge(vacc0123, vone);
- vacc0123 = wasm_f32x4_mul(vacc0123, vx0123);
- const v128_t vmaskge4567 = wasm_f32x4_ge(vacc4567, vone);
- vacc4567 = wasm_f32x4_mul(vacc4567, vx4567);
- const v128_t vmaskge89AB = wasm_f32x4_ge(vacc89AB, vone);
- vacc89AB = wasm_f32x4_mul(vacc89AB, vx89AB);
- const v128_t vmaskgeCDEF = wasm_f32x4_ge(vaccCDEF, vone);
- vaccCDEF = wasm_f32x4_mul(vaccCDEF, vxCDEF);
-
- vacc0123 = wasm_v128_bitselect(vx0123, vacc0123, vmaskge0123);
- vacc4567 = wasm_v128_bitselect(vx4567, vacc4567, vmaskge4567);
- vacc89AB = wasm_v128_bitselect(vx89AB, vacc89AB, vmaskge89AB);
- vaccCDEF = wasm_v128_bitselect(vxCDEF, vaccCDEF, vmaskgeCDEF);
-
- wasm_v128_store(y, vacc0123);
- wasm_v128_store(y + 4, vacc4567);
- wasm_v128_store(y + 8, vacc89AB);
- wasm_v128_store(y + 12, vaccCDEF);
- y += 16;
- }
- for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const v128_t vx = wasm_v128_load(x);
- x += 4;
- v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
- const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
- vacc = wasm_v128_andnot(vacc, vmasklt);
- const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
- vacc = wasm_f32x4_mul(vacc, vx);
- vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
-
- wasm_v128_store(y, vacc);
- y += 4;
- }
- if XNN_UNLIKELY(n != 0) {
- const v128_t vx = wasm_v128_load(x);
- v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
- const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
- vacc = wasm_v128_andnot(vacc, vmasklt);
- const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
- vacc = wasm_f32x4_mul(vacc, vx);
- vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
-
- if (n & (2 * sizeof(float))) {
- *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
- vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
- y += 2;
- }
- if (n & (1 * sizeof(float))) {
- *y = wasm_f32x4_extract_lane(vacc, 0);
- }
- }
-}
diff --git a/src/f32-hswish/gen/wasmsimd-x86-x4.c b/src/f32-hswish/gen/wasmsimd-x86-x4.c
deleted file mode 100644
index 335d787..0000000
--- a/src/f32-hswish/gen/wasmsimd-x86-x4.c
+++ /dev/null
@@ -1,65 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-hswish/wasmsimd.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__wasmsimd_x86_x4(
- size_t n,
- const float* x,
- float* y,
- const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
- assert(n != 0);
- assert(n % sizeof(float) == 0);
-
- const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
- const v128_t vhalf = wasm_v32x4_load_splat(&params->scalar.half);
- const v128_t vone = wasm_v32x4_load_splat(&params->scalar.one);
- const v128_t vzero = wasm_f32x4_splat(0.0f);
-
- for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const v128_t vx = wasm_v128_load(x);
- x += 4;
- v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
- const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
- vacc = wasm_v128_andnot(vacc, vmasklt);
- const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
- vacc = wasm_f32x4_mul(vacc, vx);
- vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
-
- wasm_v128_store(y, vacc);
- y += 4;
- }
- if XNN_UNLIKELY(n != 0) {
- const v128_t vx = wasm_v128_load(x);
- v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
- const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
- vacc = wasm_v128_andnot(vacc, vmasklt);
- const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
- vacc = wasm_f32x4_mul(vacc, vx);
- vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
-
- if (n & (2 * sizeof(float))) {
- *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
- vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
- y += 2;
- }
- if (n & (1 * sizeof(float))) {
- *y = wasm_f32x4_extract_lane(vacc, 0);
- }
- }
-}
diff --git a/src/f32-hswish/gen/wasmsimd-x86-x8.c b/src/f32-hswish/gen/wasmsimd-x86-x8.c
deleted file mode 100644
index b3b6762..0000000
--- a/src/f32-hswish/gen/wasmsimd-x86-x8.c
+++ /dev/null
@@ -1,90 +0,0 @@
-// Auto-generated file. Do not edit!
-// Template: src/f32-hswish/wasmsimd.c.in
-// Generator: tools/xngen
-//
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <wasm_simd128.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__wasmsimd_x86_x8(
- size_t n,
- const float* x,
- float* y,
- const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
- assert(n != 0);
- assert(n % sizeof(float) == 0);
-
- const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
- const v128_t vhalf = wasm_v32x4_load_splat(&params->scalar.half);
- const v128_t vone = wasm_v32x4_load_splat(&params->scalar.one);
- const v128_t vzero = wasm_f32x4_splat(0.0f);
-
- for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
- const v128_t vx0123 = wasm_v128_load(x);
- const v128_t vx4567 = wasm_v128_load(x + 4);
- x += 8;
-
- v128_t vacc0123 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx0123, vsixth));
- v128_t vacc4567 = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx4567, vsixth));
-
- const v128_t vmasklt0123 = wasm_f32x4_lt(vacc0123, vzero);
- vacc0123 = wasm_v128_andnot(vacc0123, vmasklt0123);
- const v128_t vmasklt4567 = wasm_f32x4_lt(vacc4567, vzero);
- vacc4567 = wasm_v128_andnot(vacc4567, vmasklt4567);
-
- const v128_t vmaskge0123 = wasm_f32x4_ge(vacc0123, vone);
- vacc0123 = wasm_f32x4_mul(vacc0123, vx0123);
- const v128_t vmaskge4567 = wasm_f32x4_ge(vacc4567, vone);
- vacc4567 = wasm_f32x4_mul(vacc4567, vx4567);
-
- vacc0123 = wasm_v128_bitselect(vx0123, vacc0123, vmaskge0123);
- vacc4567 = wasm_v128_bitselect(vx4567, vacc4567, vmaskge4567);
-
- wasm_v128_store(y, vacc0123);
- wasm_v128_store(y + 4, vacc4567);
- y += 8;
- }
- for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const v128_t vx = wasm_v128_load(x);
- x += 4;
- v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
- const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
- vacc = wasm_v128_andnot(vacc, vmasklt);
- const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
- vacc = wasm_f32x4_mul(vacc, vx);
- vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
-
- wasm_v128_store(y, vacc);
- y += 4;
- }
- if XNN_UNLIKELY(n != 0) {
- const v128_t vx = wasm_v128_load(x);
- v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
-
- const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
- vacc = wasm_v128_andnot(vacc, vmasklt);
- const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
- vacc = wasm_f32x4_mul(vacc, vx);
- vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
-
- if (n & (2 * sizeof(float))) {
- *((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
- vacc = wasm_v32x4_shuffle(vacc, vacc, 2, 3, 2, 3);
- y += 2;
- }
- if (n & (1 * sizeof(float))) {
- *y = wasm_f32x4_extract_lane(vacc, 0);
- }
- }
-}
diff --git a/src/f32-hswish/neon.c.in b/src/f32-hswish/neon.c.in
index f8af0f7..61386b2 100644
--- a/src/f32-hswish/neon.c.in
+++ b/src/f32-hswish/neon.c.in
@@ -14,7 +14,7 @@
#include <xnnpack/hswish.h>
-void xnn_f32_hswish_ukernel__${"neonfma" if FMA else "neon"}_x${BATCH_TILE}(
+void xnn_f32_hswish_ukernel__neon_x${BATCH_TILE}(
size_t n,
const float* x,
float* y,
@@ -24,61 +24,55 @@
assert(n % sizeof(float) == 0);
const float32x4_t vsixth = vld1q_dup_f32(&params->scalar.sixth);
- const float32x4_t vhalf = vld1q_dup_f32(&params->scalar.half);
- const float32x4_t vone = vld1q_dup_f32(&params->scalar.one);
- const float32x4_t vzero = vdupq_n_f32(0.0f);
+ const float32x4_t vthree = vld1q_dup_f32(&params->scalar.three);
+ const int32x4_t vsix = vreinterpretq_s32_f32(vld1q_dup_f32(&params->scalar.six));
+ const int32x4_t vzero = vdupq_n_s32(0);
- for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
- $for N in range(0, BATCH_TILE, 4):
- const float32x4_t vx${ABC[N:N+4]} = vld1q_f32(x); x += 4;
-
- $for N in range(0, BATCH_TILE, 4):
- $if FMA:
- float32x4_t vacc${ABC[N:N+4]} = vfmaq_f32(vhalf, vx${ABC[N:N+4]}, vsixth);
- $else:
- float32x4_t vacc${ABC[N:N+4]} = vmlaq_f32(vhalf, vx${ABC[N:N+4]}, vsixth);
-
- $for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = vmaxq_f32(vacc${ABC[N:N+4]}, vzero);
-
- $for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = vminq_f32(vacc${ABC[N:N+4]}, vone);
-
- $for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = vmulq_f32(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
-
- $for N in range(0, BATCH_TILE, 4):
- vst1q_f32(y, vacc${ABC[N:N+4]}); y += 4;
- }
$if BATCH_TILE > 4:
- for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const float32x4_t vx0123 = vld1q_f32(x); x += 4;
- $if FMA:
- float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
- $else:
- float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
- vacc0123 = vmaxq_f32(vacc0123, vzero);
- vacc0123 = vminq_f32(vacc0123, vone);
- vacc0123 = vmulq_f32(vacc0123, vx0123);
- vst1q_f32(y, vacc0123); y += 4;
- }
- if XNN_UNLIKELY(n != 0) {
- const float32x4_t vx0123 = vld1q_f32(x);
- $if FMA:
- float32x4_t vacc0123 = vfmaq_f32(vhalf, vx0123, vsixth);
- $else:
- float32x4_t vacc0123 = vmlaq_f32(vhalf, vx0123, vsixth);
- vacc0123 = vmaxq_f32(vacc0123, vzero);
- vacc0123 = vminq_f32(vacc0123, vone);
- vacc0123 = vmulq_f32(vacc0123, vx0123);
+ for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
+ $for N in range(0, BATCH_TILE, 4):
+ float32x4_t vx${ABC[N:N+4]} = vld1q_f32(x); x += 4;
- float32x2_t vacc01 = vget_low_f32(vacc0123);
+ $for N in range(0, BATCH_TILE, 4):
+ float32x4_t vacc${ABC[N:N+4]} = vaddq_f32(vx${ABC[N:N+4]}, vthree);
+ vx${ABC[N:N+4]} = vmulq_f32(vx${ABC[N:N+4]}, vsixth);
+
+ $for N in range(0, BATCH_TILE, 4):
+ vacc${ABC[N:N+4]} = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc${ABC[N:N+4]}), vzero));
+
+ $for N in range(0, BATCH_TILE, 4):
+ vacc${ABC[N:N+4]} = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc${ABC[N:N+4]}), vsix));
+
+ $for N in range(0, BATCH_TILE, 4):
+ vacc${ABC[N:N+4]} = vmulq_f32(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
+
+ $for N in range(0, BATCH_TILE, 4):
+ vst1q_f32(y, vacc${ABC[N:N+4]}); y += 4;
+ }
+ for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+ float32x4_t vx = vld1q_f32(x); x += 4;
+ float32x4_t vacc = vaddq_f32(vx, vthree);
+ vx = vmulq_f32(vx, vsixth);
+ vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero));
+ vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix));
+ vacc = vmulq_f32(vacc, vx);
+ vst1q_f32(y, vacc); y += 4;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ float32x4_t vx = vld1q_f32(x);
+ float32x4_t vacc = vaddq_f32(vx, vthree);
+ vx = vmulq_f32(vx, vsixth);
+ vacc = vreinterpretq_f32_s32(vmaxq_s32(vreinterpretq_s32_f32(vacc), vzero));
+ vacc = vreinterpretq_f32_s32(vminq_s32(vreinterpretq_s32_f32(vacc), vsix));
+ vacc = vmulq_f32(vacc, vx);
+
+ float32x2_t vacc_lo = vget_low_f32(vacc);
if (n & (2 * sizeof(float))) {
- vst1_f32(y, vacc01); y += 2;
- vacc01 = vget_high_f32(vacc0123);
+ vst1_f32(y, vacc_lo); y += 2;
+ vacc_lo = vget_high_f32(vacc);
}
if (n & (1 * sizeof(float))) {
- vst1_lane_f32(y, vacc01, 0);
+ vst1_lane_f32(y, vacc_lo, 0);
}
}
}
diff --git a/src/f32-hswish/psimd.c.in b/src/f32-hswish/psimd.c.in
deleted file mode 100644
index 2f71c29..0000000
--- a/src/f32-hswish/psimd.c.in
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-$assert BATCH_TILE % 4 == 0
-$assert BATCH_TILE >= 4
-$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-#include <assert.h>
-
-#include <psimd.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/hswish.h>
-
-
-void xnn_f32_hswish_ukernel__psimd_x${BATCH_TILE}(
- size_t n,
- const float* x,
- float* y,
- const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
- assert(n != 0);
- assert(n % sizeof(float) == 0);
-
- const psimd_f32 vsixth = psimd_load_splat_f32(&params->scalar.sixth);
- const psimd_f32 vhalf = psimd_load_splat_f32(&params->scalar.half);
- const psimd_f32 vone = psimd_load_splat_f32(&params->scalar.one);
- const psimd_f32 vzero = psimd_splat_f32(0.0f);
-
- for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
- const psimd_f32 vx${ABC[0:4]} = psimd_load_f32(x);
- $for N in range(4, BATCH_TILE, 4):
- const psimd_f32 vx${ABC[N:N+4]} = psimd_load_f32(x + ${N});
- x += ${BATCH_TILE};
-
- $for N in range(0, BATCH_TILE, 4):
- psimd_f32 vacc${ABC[N:N+4]} = psimd_qfma_f32(vhalf, vx${ABC[N:N+4]}, vsixth);
-
- $for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = psimd_max_f32(vacc${ABC[N:N+4]}, vzero);
-
- $for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = psimd_min_f32(vacc${ABC[N:N+4]}, vone);
-
- $for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = psimd_mul_f32(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
-
- psimd_store_f32(y, vacc${ABC[0:4]});
- $for N in range(4, BATCH_TILE, 4):
- psimd_store_f32(y + ${N}, vacc${ABC[N:N+4]});
- y += ${BATCH_TILE};
- }
- $if BATCH_TILE > 4:
- for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const psimd_f32 vx0123 = psimd_load_f32(x);
- x += 4;
- psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
- vacc0123 = psimd_max_f32(vacc0123, vzero);
- vacc0123 = psimd_min_f32(vacc0123, vone);
- vacc0123 = psimd_mul_f32(vacc0123, vx0123);
- psimd_store_f32(y, vacc0123);
- y += 4;
- }
- if XNN_UNLIKELY(n != 0) {
- const psimd_f32 vx0123 = psimd_load_f32(x);
- psimd_f32 vacc0123 = psimd_qfma_f32(vhalf, vx0123, vsixth);
- vacc0123 = psimd_max_f32(vacc0123, vzero);
- vacc0123 = psimd_min_f32(vacc0123, vone);
- vacc0123 = psimd_mul_f32(vacc0123, vx0123);
-
- if (n & (2 * sizeof(float))) {
- psimd_store2_f32(y, vacc0123);
- vacc0123 = psimd_concat_hi_f32(vacc0123, vacc0123);
- y += 2;
- }
- if (n & (1 * sizeof(float))) {
- psimd_store1_f32(y, vacc0123);
- }
- }
-}
diff --git a/src/f32-hswish/scalar.c.in b/src/f32-hswish/scalar.c.in
index fd9c00b..fa26801 100644
--- a/src/f32-hswish/scalar.c.in
+++ b/src/f32-hswish/scalar.c.in
@@ -24,25 +24,27 @@
assert(n % sizeof(float) == 0);
const float vsixth = params->scalar.sixth;
- const float vhalf = params->scalar.half;
- const float vone = params->scalar.one;
- assert(vhalf == 0.5f);
- assert(vone == 1.0f);
+ const float vthree = params->scalar.three;
+ const float vsix = params->scalar.six;
+ const float vzero = 0.0f;
+ assert(vthree == 3.0f);
+ assert(vsix == 6.0f);
$if BATCH_TILE > 1:
for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
$for N in range(BATCH_TILE):
- const float vx${ABC[N]} = x[${N}];
+ float vx${ABC[N]} = x[${N}];
x += ${BATCH_TILE};
$for N in range(BATCH_TILE):
- float vacc${ABC[N]} = vx${ABC[N]} * vsixth + vhalf;
+ float vacc${ABC[N]} = vx${ABC[N]} + vthree;
+ vx${ABC[N]} *= vsixth;
$for N in range(BATCH_TILE):
- vacc${ABC[N]} = ${MAX_F32}(vacc${ABC[N]}, 0.0f);
+ vacc${ABC[N]} = ${MAX_F32}(vacc${ABC[N]}, vzero);
$for N in range(BATCH_TILE):
- vacc${ABC[N]} = ${MIN_F32}(vacc${ABC[N]}, vone);
+ vacc${ABC[N]} = ${MIN_F32}(vacc${ABC[N]}, vsix);
$for N in range(BATCH_TILE):
vacc${ABC[N]} *= vx${ABC[N]};
@@ -54,29 +56,32 @@
if XNN_UNLIKELY(n != 0) {
$if BATCH_TILE > 2:
do {
- const float vx = *x++;
- float vacc = vx * vsixth + vhalf;
- vacc = ${MAX_F32}(vacc, 0.0f);
- vacc = ${MIN_F32}(vacc, vone);
- vacc = vacc * vx;
+ float vx = *x++;
+ float vacc = vx + vthree;
+ vx *= vsixth;
+ vacc = ${MAX_F32}(vacc, vzero);
+ vacc = ${MIN_F32}(vacc, vsix);
+ vacc *= vx;
*y++ = vacc;
n -= sizeof(float);
} while (n != 0);
$else:
- const float vx = *x;
- float vacc = vx * vsixth + vhalf;
- vacc = ${MAX_F32}(vacc, 0.0f);
- vacc = ${MIN_F32}(vacc, vone);
- vacc = vacc * vx;
+ float vx = *x;
+ float vacc = vx + vthree;
+ vx *= vsixth;
+ vacc = ${MAX_F32}(vacc, vzero);
+ vacc = ${MIN_F32}(vacc, vsix);
+ vacc *= vx;
*y = vacc;
}
$else:
for (; n >= sizeof(float); n -= sizeof(float)) {
- const float vx = *x++;
- float vacc = vx * vsixth + vhalf;
- vacc = ${MAX_F32}(vacc, 0.0f);
- vacc = ${MIN_F32}(vacc, vone);
- vacc = vacc * vx;
+ float vx = *x++;
+ float vacc = vx + vthree;
+ vx *= vsixth;
+ vacc = ${MAX_F32}(vacc, vzero);
+ vacc = ${MIN_F32}(vacc, vsix);
+ vacc *= vx;
*y++ = vacc;
}
}
diff --git a/src/f32-hswish/wasmsimd.c.in b/src/f32-hswish/wasmsimd.c.in
index b8405d1..5854b26 100644
--- a/src/f32-hswish/wasmsimd.c.in
+++ b/src/f32-hswish/wasmsimd.c.in
@@ -14,7 +14,7 @@
#include <xnnpack/hswish.h>
-void xnn_f32_hswish_ukernel__wasmsimd_${"x86" if X86 else "arm"}_x${BATCH_TILE}(
+void xnn_f32_hswish_ukernel__wasmsimd_x${BATCH_TILE}(
size_t n,
const float* x,
float* y,
@@ -24,40 +24,29 @@
assert(n % sizeof(float) == 0);
const v128_t vsixth = wasm_v32x4_load_splat(&params->scalar.sixth);
- const v128_t vhalf = wasm_v32x4_load_splat(&params->scalar.half);
- const v128_t vone = wasm_v32x4_load_splat(&params->scalar.one);
+ const v128_t vthree = wasm_v32x4_load_splat(&params->scalar.three);
+ const v128_t vsix = wasm_v32x4_load_splat(&params->scalar.six);
const v128_t vzero = wasm_f32x4_splat(0.0f);
$if BATCH_TILE > 4:
for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
- const v128_t vx${ABC[0:4]} = wasm_v128_load(x);
+ v128_t vx${ABC[0:4]} = wasm_v128_load(x);
$for N in range(4, BATCH_TILE, 4):
- const v128_t vx${ABC[N:N+4]} = wasm_v128_load(x + ${N});
+ v128_t vx${ABC[N:N+4]} = wasm_v128_load(x + ${N});
x += ${BATCH_TILE};
$for N in range(0, BATCH_TILE, 4):
- v128_t vacc${ABC[N:N+4]} = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx${ABC[N:N+4]}, vsixth));
+ v128_t vacc${ABC[N:N+4]} = wasm_f32x4_add(vx${ABC[N:N+4]}, vthree);
+ vx${ABC[N:N+4]} = wasm_f32x4_mul(vx${ABC[N:N+4]}, vsixth);
- $if X86:
- $for N in range(0, BATCH_TILE, 4):
- const v128_t vmasklt${ABC[N:N+4]} = wasm_f32x4_lt(vacc${ABC[N:N+4]}, vzero);
- vacc${ABC[N:N+4]} = wasm_v128_andnot(vacc${ABC[N:N+4]}, vmasklt${ABC[N:N+4]});
+ $for N in range(0, BATCH_TILE, 4):
+ vacc${ABC[N:N+4]} = wasm_i32x4_max(vacc${ABC[N:N+4]}, vzero);
- $for N in range(0, BATCH_TILE, 4):
- const v128_t vmaskge${ABC[N:N+4]} = wasm_f32x4_ge(vacc${ABC[N:N+4]}, vone);
- vacc${ABC[N:N+4]} = wasm_f32x4_mul(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
+ $for N in range(0, BATCH_TILE, 4):
+ vacc${ABC[N:N+4]} = wasm_i32x4_min(vacc${ABC[N:N+4]}, vsix);
- $for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = wasm_v128_bitselect(vx${ABC[N:N+4]}, vacc${ABC[N:N+4]}, vmaskge${ABC[N:N+4]});
- $else:
- $for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = wasm_f32x4_max(vacc${ABC[N:N+4]}, vzero);
-
- $for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = wasm_f32x4_min(vacc${ABC[N:N+4]}, vone);
-
- $for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = wasm_f32x4_mul(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
+ $for N in range(0, BATCH_TILE, 4):
+ vacc${ABC[N:N+4]} = wasm_f32x4_mul(vacc${ABC[N:N+4]}, vx${ABC[N:N+4]});
wasm_v128_store(y, vacc${ABC[0:4]});
$for N in range(4, BATCH_TILE, 4):
@@ -65,38 +54,26 @@
y += ${BATCH_TILE};
}
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
- const v128_t vx = wasm_v128_load(x);
+ v128_t vx = wasm_v128_load(x);
x += 4;
- v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
- $if X86:
- const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
- vacc = wasm_v128_andnot(vacc, vmasklt);
- const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
- vacc = wasm_f32x4_mul(vacc, vx);
- vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
- $else:
- vacc = wasm_f32x4_max(vacc, vzero);
- vacc = wasm_f32x4_min(vacc, vone);
- vacc = wasm_f32x4_mul(vacc, vx);
+ v128_t vacc = wasm_f32x4_add(vx, vthree);
+ vx = wasm_f32x4_mul(vx, vsixth);
+ vacc = wasm_i32x4_max(vacc, vzero);
+ vacc = wasm_i32x4_min(vacc, vsix);
+ vacc = wasm_f32x4_mul(vacc, vx);
wasm_v128_store(y, vacc);
y += 4;
}
if XNN_UNLIKELY(n != 0) {
- const v128_t vx = wasm_v128_load(x);
- v128_t vacc = wasm_f32x4_add(vhalf, wasm_f32x4_mul(vx, vsixth));
+ v128_t vx = wasm_v128_load(x);
- $if X86:
- const v128_t vmasklt = wasm_f32x4_lt(vacc, vzero);
- vacc = wasm_v128_andnot(vacc, vmasklt);
- const v128_t vmaskge = wasm_f32x4_ge(vacc, vone);
- vacc = wasm_f32x4_mul(vacc, vx);
- vacc = wasm_v128_bitselect(vx, vacc, vmaskge);
- $else:
- vacc = wasm_f32x4_max(vacc, vzero);
- vacc = wasm_f32x4_min(vacc, vone);
- vacc = wasm_f32x4_mul(vacc, vx);
+ v128_t vacc = wasm_f32x4_add(vx, vthree);
+ vx = wasm_f32x4_mul(vx, vsixth);
+ vacc = wasm_i32x4_max(vacc, vzero);
+ vacc = wasm_i32x4_min(vacc, vsix);
+ vacc = wasm_f32x4_mul(vacc, vx);
if (n & (2 * sizeof(float))) {
*((double*) y) = wasm_f64x2_extract_lane(vacc, 0);
diff --git a/src/init.c b/src/init.c
index 5026d87..3001112 100644
--- a/src/init.c
+++ b/src/init.c
@@ -1016,7 +1016,7 @@
};
xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__neon_x8;
xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__neon_x8;
- xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neonfma_x8;
+ xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__neon_x8;
xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__neon_x8;
xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__neon_x8;
xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__neonv8_x8;
@@ -1835,11 +1835,10 @@
xnn_params.f32.abs = (xnn_univector_ukernel_function) xnn_f32_vabs_ukernel__wasmsimd_x8;
if (is_wasm_x86) {
xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasmsimd_x86_x8;
- xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasmsimd_x86_x16;
} else {
xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasmsimd_arm_x8;
- xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasmsimd_arm_x8;
}
+ xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasmsimd_x16;
xnn_params.f32.lrelu = (xnn_univector_ukernel_function) xnn_f32_vlrelu_ukernel__psimd_x8;
xnn_params.f32.neg = (xnn_univector_ukernel_function) xnn_f32_vneg_ukernel__wasmsimd_x8;
xnn_params.f32.rndne = (xnn_univector_ukernel_function) xnn_f32_vrndne_ukernel__wasmsimd_x8;
diff --git a/src/xnnpack/hswish.h b/src/xnnpack/hswish.h
index 571efb7..130c737 100644
--- a/src/xnnpack/hswish.h
+++ b/src/xnnpack/hswish.h
@@ -36,9 +36,6 @@
DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neon_x4)
DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neon_x8)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neonfma_x4)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neonfma_x8)
-
DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__sse_x4)
DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__sse_x8)
@@ -51,15 +48,9 @@
DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__avx512f_x16)
DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__avx512f_x32)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__psimd_x4)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__psimd_x8)
-
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_arm_x4)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_arm_x8)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_arm_x16)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_x86_x4)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_x86_x8)
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_x86_x16)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_x4)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_x8)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasmsimd_x16)
DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasm_x1)
DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasm_x2)
diff --git a/src/xnnpack/params-init.h b/src/xnnpack/params-init.h
index 9b74880..4c2f582 100644
--- a/src/xnnpack/params-init.h
+++ b/src/xnnpack/params-init.h
@@ -466,8 +466,8 @@
}
#else
params.scalar.sixth = 0x1.555556p-3f;
- params.scalar.half = 0.5f;
- params.scalar.one = 1.0f;
+ params.scalar.three = 3.0f;
+ params.scalar.six = 6.0f;
#endif
return params;
}
@@ -476,8 +476,8 @@
{
union xnn_f32_hswish_params params;
params.scalar.sixth = 0x1.555556p-3f;
- params.scalar.half = 0.5f;
- params.scalar.one = 1.0f;
+ params.scalar.three = 3.0f;
+ params.scalar.six = 6.0f;
return params;
}
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index c0236d5..7099158 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -209,8 +209,8 @@
union xnn_f32_hswish_params {
struct {
float sixth;
- float half;
- float one;
+ float three;
+ float six;
} scalar;
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
struct {
diff --git a/test/f32-hswish.cc b/test/f32-hswish.cc
index 7a5aeac..9b26996 100644
--- a/test/f32-hswish.cc
+++ b/test/f32-hswish.cc
@@ -111,100 +111,6 @@
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
- TEST(F32_HSWISH__NEONFMA_X4, batch_eq_4) {
- TEST_REQUIRES_ARM_NEON_FMA;
- HSwishMicrokernelTester()
- .batch_size(4)
- .Test(xnn_f32_hswish_ukernel__neonfma_x4);
- }
-
- TEST(F32_HSWISH__NEONFMA_X4, batch_div_4) {
- TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__neonfma_x4);
- }
- }
-
- TEST(F32_HSWISH__NEONFMA_X4, batch_lt_4) {
- TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t batch_size = 1; batch_size < 4; batch_size++) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__neonfma_x4);
- }
- }
-
- TEST(F32_HSWISH__NEONFMA_X4, batch_gt_4) {
- TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t batch_size = 5; batch_size < 8; batch_size++) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__neonfma_x4);
- }
- }
-
- TEST(F32_HSWISH__NEONFMA_X4, inplace) {
- TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .inplace(true)
- .Test(xnn_f32_hswish_ukernel__neonfma_x4);
- }
- }
-#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
-#if XNN_ARCH_ARM || XNN_ARCH_ARM64
- TEST(F32_HSWISH__NEONFMA_X8, batch_eq_8) {
- TEST_REQUIRES_ARM_NEON_FMA;
- HSwishMicrokernelTester()
- .batch_size(8)
- .Test(xnn_f32_hswish_ukernel__neonfma_x8);
- }
-
- TEST(F32_HSWISH__NEONFMA_X8, batch_div_8) {
- TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__neonfma_x8);
- }
- }
-
- TEST(F32_HSWISH__NEONFMA_X8, batch_lt_8) {
- TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t batch_size = 1; batch_size < 8; batch_size++) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__neonfma_x8);
- }
- }
-
- TEST(F32_HSWISH__NEONFMA_X8, batch_gt_8) {
- TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t batch_size = 9; batch_size < 16; batch_size++) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__neonfma_x8);
- }
- }
-
- TEST(F32_HSWISH__NEONFMA_X8, inplace) {
- TEST_REQUIRES_ARM_NEON_FMA;
- for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .inplace(true)
- .Test(xnn_f32_hswish_ukernel__neonfma_x8);
- }
- }
-#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
-
-
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(F32_HSWISH__SSE_X4, batch_eq_4) {
TEST_REQUIRES_X86_SSE;
@@ -581,347 +487,127 @@
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- TEST(F32_HSWISH__PSIMD_X4, batch_eq_4) {
- TEST_REQUIRES_PSIMD;
- HSwishMicrokernelTester()
- .batch_size(4)
- .Test(xnn_f32_hswish_ukernel__psimd_x4, HSwishMicrokernelTester::Variant::Scalar);
- }
-
- TEST(F32_HSWISH__PSIMD_X4, batch_div_4) {
- TEST_REQUIRES_PSIMD;
- for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__psimd_x4, HSwishMicrokernelTester::Variant::Scalar);
- }
- }
-
- TEST(F32_HSWISH__PSIMD_X4, batch_lt_4) {
- TEST_REQUIRES_PSIMD;
- for (size_t batch_size = 1; batch_size < 4; batch_size++) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__psimd_x4, HSwishMicrokernelTester::Variant::Scalar);
- }
- }
-
- TEST(F32_HSWISH__PSIMD_X4, batch_gt_4) {
- TEST_REQUIRES_PSIMD;
- for (size_t batch_size = 5; batch_size < 8; batch_size++) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__psimd_x4, HSwishMicrokernelTester::Variant::Scalar);
- }
- }
-
- TEST(F32_HSWISH__PSIMD_X4, inplace) {
- TEST_REQUIRES_PSIMD;
- for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .inplace(true)
- .Test(xnn_f32_hswish_ukernel__psimd_x4, HSwishMicrokernelTester::Variant::Scalar);
- }
- }
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
-#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
- TEST(F32_HSWISH__PSIMD_X8, batch_eq_8) {
- TEST_REQUIRES_PSIMD;
- HSwishMicrokernelTester()
- .batch_size(8)
- .Test(xnn_f32_hswish_ukernel__psimd_x8, HSwishMicrokernelTester::Variant::Scalar);
- }
-
- TEST(F32_HSWISH__PSIMD_X8, batch_div_8) {
- TEST_REQUIRES_PSIMD;
- for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__psimd_x8, HSwishMicrokernelTester::Variant::Scalar);
- }
- }
-
- TEST(F32_HSWISH__PSIMD_X8, batch_lt_8) {
- TEST_REQUIRES_PSIMD;
- for (size_t batch_size = 1; batch_size < 8; batch_size++) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__psimd_x8, HSwishMicrokernelTester::Variant::Scalar);
- }
- }
-
- TEST(F32_HSWISH__PSIMD_X8, batch_gt_8) {
- TEST_REQUIRES_PSIMD;
- for (size_t batch_size = 9; batch_size < 16; batch_size++) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__psimd_x8, HSwishMicrokernelTester::Variant::Scalar);
- }
- }
-
- TEST(F32_HSWISH__PSIMD_X8, inplace) {
- TEST_REQUIRES_PSIMD;
- for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .inplace(true)
- .Test(xnn_f32_hswish_ukernel__psimd_x8, HSwishMicrokernelTester::Variant::Scalar);
- }
- }
-#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
-
-
#if XNN_ARCH_WASMSIMD
- TEST(F32_HSWISH__WASMSIMD_ARM_X4, batch_eq_4) {
+ TEST(F32_HSWISH__WASMSIMD_X4, batch_eq_4) {
HSwishMicrokernelTester()
.batch_size(4)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x4);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x4);
}
- TEST(F32_HSWISH__WASMSIMD_ARM_X4, batch_div_4) {
+ TEST(F32_HSWISH__WASMSIMD_X4, batch_div_4) {
for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
HSwishMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x4);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x4);
}
}
- TEST(F32_HSWISH__WASMSIMD_ARM_X4, batch_lt_4) {
+ TEST(F32_HSWISH__WASMSIMD_X4, batch_lt_4) {
for (size_t batch_size = 1; batch_size < 4; batch_size++) {
HSwishMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x4);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x4);
}
}
- TEST(F32_HSWISH__WASMSIMD_ARM_X4, batch_gt_4) {
+ TEST(F32_HSWISH__WASMSIMD_X4, batch_gt_4) {
for (size_t batch_size = 5; batch_size < 8; batch_size++) {
HSwishMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x4);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x4);
}
}
- TEST(F32_HSWISH__WASMSIMD_ARM_X4, inplace) {
+ TEST(F32_HSWISH__WASMSIMD_X4, inplace) {
for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
HSwishMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x4);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x4);
}
}
#endif // XNN_ARCH_WASMSIMD
#if XNN_ARCH_WASMSIMD
- TEST(F32_HSWISH__WASMSIMD_ARM_X8, batch_eq_8) {
+ TEST(F32_HSWISH__WASMSIMD_X8, batch_eq_8) {
HSwishMicrokernelTester()
.batch_size(8)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x8);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x8);
}
- TEST(F32_HSWISH__WASMSIMD_ARM_X8, batch_div_8) {
+ TEST(F32_HSWISH__WASMSIMD_X8, batch_div_8) {
for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
HSwishMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x8);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x8);
}
}
- TEST(F32_HSWISH__WASMSIMD_ARM_X8, batch_lt_8) {
+ TEST(F32_HSWISH__WASMSIMD_X8, batch_lt_8) {
for (size_t batch_size = 1; batch_size < 8; batch_size++) {
HSwishMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x8);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x8);
}
}
- TEST(F32_HSWISH__WASMSIMD_ARM_X8, batch_gt_8) {
+ TEST(F32_HSWISH__WASMSIMD_X8, batch_gt_8) {
for (size_t batch_size = 9; batch_size < 16; batch_size++) {
HSwishMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x8);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x8);
}
}
- TEST(F32_HSWISH__WASMSIMD_ARM_X8, inplace) {
+ TEST(F32_HSWISH__WASMSIMD_X8, inplace) {
for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
HSwishMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x8);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x8);
}
}
#endif // XNN_ARCH_WASMSIMD
#if XNN_ARCH_WASMSIMD
- TEST(F32_HSWISH__WASMSIMD_ARM_X16, batch_eq_16) {
+ TEST(F32_HSWISH__WASMSIMD_X16, batch_eq_16) {
HSwishMicrokernelTester()
.batch_size(16)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x16);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x16);
}
- TEST(F32_HSWISH__WASMSIMD_ARM_X16, batch_div_16) {
+ TEST(F32_HSWISH__WASMSIMD_X16, batch_div_16) {
for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
HSwishMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x16);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x16);
}
}
- TEST(F32_HSWISH__WASMSIMD_ARM_X16, batch_lt_16) {
+ TEST(F32_HSWISH__WASMSIMD_X16, batch_lt_16) {
for (size_t batch_size = 1; batch_size < 16; batch_size++) {
HSwishMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x16);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x16);
}
}
- TEST(F32_HSWISH__WASMSIMD_ARM_X16, batch_gt_16) {
+ TEST(F32_HSWISH__WASMSIMD_X16, batch_gt_16) {
for (size_t batch_size = 17; batch_size < 32; batch_size++) {
HSwishMicrokernelTester()
.batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x16);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x16);
}
}
- TEST(F32_HSWISH__WASMSIMD_ARM_X16, inplace) {
+ TEST(F32_HSWISH__WASMSIMD_X16, inplace) {
for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
HSwishMicrokernelTester()
.batch_size(batch_size)
.inplace(true)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_arm_x16);
- }
- }
-#endif // XNN_ARCH_WASMSIMD
-
-
-#if XNN_ARCH_WASMSIMD
- TEST(F32_HSWISH__WASMSIMD_X86_X4, batch_eq_4) {
- HSwishMicrokernelTester()
- .batch_size(4)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x4);
- }
-
- TEST(F32_HSWISH__WASMSIMD_X86_X4, batch_div_4) {
- for (size_t batch_size = 8; batch_size < 40; batch_size += 4) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x4);
- }
- }
-
- TEST(F32_HSWISH__WASMSIMD_X86_X4, batch_lt_4) {
- for (size_t batch_size = 1; batch_size < 4; batch_size++) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x4);
- }
- }
-
- TEST(F32_HSWISH__WASMSIMD_X86_X4, batch_gt_4) {
- for (size_t batch_size = 5; batch_size < 8; batch_size++) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x4);
- }
- }
-
- TEST(F32_HSWISH__WASMSIMD_X86_X4, inplace) {
- for (size_t batch_size = 1; batch_size <= 20; batch_size += 3) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .inplace(true)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x4);
- }
- }
-#endif // XNN_ARCH_WASMSIMD
-
-
-#if XNN_ARCH_WASMSIMD
- TEST(F32_HSWISH__WASMSIMD_X86_X8, batch_eq_8) {
- HSwishMicrokernelTester()
- .batch_size(8)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x8);
- }
-
- TEST(F32_HSWISH__WASMSIMD_X86_X8, batch_div_8) {
- for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x8);
- }
- }
-
- TEST(F32_HSWISH__WASMSIMD_X86_X8, batch_lt_8) {
- for (size_t batch_size = 1; batch_size < 8; batch_size++) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x8);
- }
- }
-
- TEST(F32_HSWISH__WASMSIMD_X86_X8, batch_gt_8) {
- for (size_t batch_size = 9; batch_size < 16; batch_size++) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x8);
- }
- }
-
- TEST(F32_HSWISH__WASMSIMD_X86_X8, inplace) {
- for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .inplace(true)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x8);
- }
- }
-#endif // XNN_ARCH_WASMSIMD
-
-
-#if XNN_ARCH_WASMSIMD
- TEST(F32_HSWISH__WASMSIMD_X86_X16, batch_eq_16) {
- HSwishMicrokernelTester()
- .batch_size(16)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x16);
- }
-
- TEST(F32_HSWISH__WASMSIMD_X86_X16, batch_div_16) {
- for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x16);
- }
- }
-
- TEST(F32_HSWISH__WASMSIMD_X86_X16, batch_lt_16) {
- for (size_t batch_size = 1; batch_size < 16; batch_size++) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x16);
- }
- }
-
- TEST(F32_HSWISH__WASMSIMD_X86_X16, batch_gt_16) {
- for (size_t batch_size = 17; batch_size < 32; batch_size++) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x16);
- }
- }
-
- TEST(F32_HSWISH__WASMSIMD_X86_X16, inplace) {
- for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
- HSwishMicrokernelTester()
- .batch_size(batch_size)
- .inplace(true)
- .Test(xnn_f32_hswish_ukernel__wasmsimd_x86_x16);
+ .Test(xnn_f32_hswish_ukernel__wasmsimd_x16);
}
}
#endif // XNN_ARCH_WASMSIMD
diff --git a/test/f32-hswish.yaml b/test/f32-hswish.yaml
index 1c33240..781537e 100644
--- a/test/f32-hswish.yaml
+++ b/test/f32-hswish.yaml
@@ -4,8 +4,6 @@
# LICENSE file in the root directory of this source tree.
- name: xnn_f32_hswish_ukernel__neon_x4
- name: xnn_f32_hswish_ukernel__neon_x8
-- name: xnn_f32_hswish_ukernel__neonfma_x4
-- name: xnn_f32_hswish_ukernel__neonfma_x8
- name: xnn_f32_hswish_ukernel__sse_x4
- name: xnn_f32_hswish_ukernel__sse_x8
- name: xnn_f32_hswish_ukernel__avx_x8
@@ -14,14 +12,9 @@
- name: xnn_f32_hswish_ukernel__fma3_x16
- name: xnn_f32_hswish_ukernel__avx512f_x16
- name: xnn_f32_hswish_ukernel__avx512f_x32
-- name: xnn_f32_hswish_ukernel__psimd_x4
-- name: xnn_f32_hswish_ukernel__psimd_x8
-- name: xnn_f32_hswish_ukernel__wasmsimd_arm_x4
-- name: xnn_f32_hswish_ukernel__wasmsimd_arm_x8
-- name: xnn_f32_hswish_ukernel__wasmsimd_arm_x16
-- name: xnn_f32_hswish_ukernel__wasmsimd_x86_x4
-- name: xnn_f32_hswish_ukernel__wasmsimd_x86_x8
-- name: xnn_f32_hswish_ukernel__wasmsimd_x86_x16
+- name: xnn_f32_hswish_ukernel__wasmsimd_x4
+- name: xnn_f32_hswish_ukernel__wasmsimd_x8
+- name: xnn_f32_hswish_ukernel__wasmsimd_x16
- name: xnn_f32_hswish_ukernel__wasm_x1
- name: xnn_f32_hswish_ukernel__wasm_x2
- name: xnn_f32_hswish_ukernel__wasm_x4