Add F32 VLRELU benchmarks
- Base on F32 VRELU benchmarks
- Use benchmark::utils::UnaryElementwiseParameters
PiperOrigin-RevId: 389412604
diff --git a/BUILD.bazel b/BUILD.bazel
index acec396..16ff5e6 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -7566,6 +7566,15 @@
)
xnnpack_benchmark(
+ name = "f32_vlrelu_bench",  # benchmark target for the F32 VLRELU micro-kernels
+ srcs = [
+ "bench/f32-vlrelu.cc",
+ "src/xnnpack/AlignedAllocator.h",  # included directly by bench/f32-vlrelu.cc
+ ] + MICROKERNEL_BENCHMARK_HDRS,
+ deps = MICROKERNEL_BENCHMARK_DEPS,
+)
+
+xnnpack_benchmark(
name = "f32_vrelu_bench",
srcs = [
"bench/f32-vrelu.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cfa425e..df0e48c 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7234,6 +7234,14 @@
TARGET_INCLUDE_DIRECTORIES(f32-vhswish-bench PRIVATE . include src)
TARGET_LINK_LIBRARIES(f32-vhswish-bench PRIVATE benchmark bench-utils cpuinfo fp16 pthreadpool)
+ ADD_EXECUTABLE(f32-vlrelu-bench bench/f32-vlrelu.cc $<TARGET_OBJECTS:all_microkernels>)  # F32 VLRELU micro-kernel benchmark
+ SET_TARGET_PROPERTIES(f32-vlrelu-bench PROPERTIES
+ CXX_STANDARD 11
+ CXX_STANDARD_REQUIRED YES
+ CXX_EXTENSIONS YES)
+ TARGET_INCLUDE_DIRECTORIES(f32-vlrelu-bench PRIVATE . include src)  # "." resolves "bench/utils.h"; "src" resolves <xnnpack/*.h>
+ TARGET_LINK_LIBRARIES(f32-vlrelu-bench PRIVATE benchmark bench-utils cpuinfo fp16 pthreadpool)
+
ADD_EXECUTABLE(f32-vrelu-bench bench/f32-vrelu.cc $<TARGET_OBJECTS:all_microkernels>)
SET_TARGET_PROPERTIES(f32-vrelu-bench PROPERTIES
CXX_STANDARD 11
diff --git a/bench/f32-vlrelu.cc b/bench/f32-vlrelu.cc
new file mode 100644
index 0000000..b0c84f3
--- /dev/null
+++ b/bench/f32-vlrelu.cc
@@ -0,0 +1,171 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <random>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+#include "bench/utils.h"
+
+#include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
+#include <xnnpack/vunary.h>
+#include <xnnpack/params.h>
+#include <xnnpack/params-init.h>
+
+// Benchmarks one F32 leaky-ReLU micro-kernel `vlrelu` on state.range(0) floats; returns early if `isa_check` fails.
+static void f32_vlrelu(
+  benchmark::State& state,
+  xnn_f32_vlrelu_ukernel_function vlrelu,
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)
+{
+  if (isa_check && !isa_check(state)) {
+    return;
+  }
+  // The element count for this run is the first benchmark argument.
+  const size_t elements = state.range(0);
+  std::vector<float, AlignedAllocator<float, 64>> input(elements);
+  std::vector<float, AlignedAllocator<float, 64>> output(elements);
+  // Inputs are uniform in [-5, 5), so both signs occur; the output starts out filled with NaN.
+  std::random_device random_device;
+  std::mt19937 rng(random_device());
+  std::uniform_real_distribution<float> f32dist(-5.0f, 5.0f);
+  std::generate(input.begin(), input.end(), [&]() { return f32dist(rng); });
+  std::fill(output.begin(), output.end(), std::nanf(""));
+  // Timed loop: one kernel call per iteration, size passed in bytes; 0.01f is presumably the negative slope.
+  union xnn_f32_lrelu_params params;
+  xnn_init_f32_lrelu_params(&params, 0.01f);
+  for (auto _ : state) {
+    vlrelu(elements * sizeof(float), input.data(), output.data(), &params);
+  }
+  // The CPU frequency counter is reported only when the helper returns a non-zero value.
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+  // Rate counter: elements processed across all iterations.
+  const size_t elements_per_iteration = elements;
+  state.counters["elements"] =
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+  // Rate counter: bytes across all iterations, counting the input and the output buffer once each.
+  const size_t bytes_per_iteration = 2 * elements * sizeof(float);
+  state.counters["bytes"] =
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+}
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64  // NEON kernels; CheckNEON is passed to f32_vlrelu as isa_check
+  BENCHMARK_CAPTURE(f32_vlrelu, neon_x4,
+                    xnn_f32_vlrelu_ukernel__neon_x4,
+                    benchmark::utils::CheckNEON)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_vlrelu, neon_x8,
+                    xnn_f32_vlrelu_ukernel__neon_x8,
+                    benchmark::utils::CheckNEON)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64  // x86 kernels: SSE, SSE2, SSE4.1, AVX and AVX512F variants
+  BENCHMARK_CAPTURE(f32_vlrelu, sse_x4,
+                    xnn_f32_vlrelu_ukernel__sse_x4)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_vlrelu, sse_x8,
+                    xnn_f32_vlrelu_ukernel__sse_x8)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_vlrelu, sse2_x4,
+                    xnn_f32_vlrelu_ukernel__sse2_x4)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_vlrelu, sse2_x8,
+                    xnn_f32_vlrelu_ukernel__sse2_x8)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  // SSE4.1/AVX/AVX512F are optional CPU features, so pass an ISA check (helpers assumed in bench/utils.h -- TODO confirm).
+  BENCHMARK_CAPTURE(f32_vlrelu, sse41_x4,
+                    xnn_f32_vlrelu_ukernel__sse41_x4, benchmark::utils::CheckSSE41)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_vlrelu, sse41_x8,
+                    xnn_f32_vlrelu_ukernel__sse41_x8, benchmark::utils::CheckSSE41)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_vlrelu, avx_x8,
+                    xnn_f32_vlrelu_ukernel__avx_x8, benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_vlrelu, avx_x16,
+                    xnn_f32_vlrelu_ukernel__avx_x16, benchmark::utils::CheckAVX)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(f32_vlrelu, avx512f_x16,
+                    xnn_f32_vlrelu_ukernel__avx512f_x16, benchmark::utils::CheckAVX512F)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(f32_vlrelu, avx512f_x32,
+                    xnn_f32_vlrelu_ukernel__avx512f_x32, benchmark::utils::CheckAVX512F)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_WASMSIMD  // WAsm SIMD kernels: bitselect and minmax variants, x4 and x8 each
+  BENCHMARK_CAPTURE(
+      f32_vlrelu, wasmsimd_bitselect_x4, xnn_f32_vlrelu_ukernel__wasmsimd_bitselect_x4)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(
+      f32_vlrelu, wasmsimd_bitselect_x8, xnn_f32_vlrelu_ukernel__wasmsimd_bitselect_x8)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+
+  BENCHMARK_CAPTURE(
+      f32_vlrelu, wasmsimd_minmax_x4, xnn_f32_vlrelu_ukernel__wasmsimd_minmax_x4)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(
+      f32_vlrelu, wasmsimd_minmax_x8, xnn_f32_vlrelu_ukernel__wasmsimd_minmax_x8)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+#endif  // XNN_ARCH_WASMSIMD
+
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD  // wasm_x1/x2/x4 kernels, registered for WAsm and WAsm SIMD builds alike
+  BENCHMARK_CAPTURE(
+      f32_vlrelu, wasm_x1, xnn_f32_vlrelu_ukernel__wasm_x1)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(
+      f32_vlrelu, wasm_x2, xnn_f32_vlrelu_ukernel__wasm_x2)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+  BENCHMARK_CAPTURE(
+      f32_vlrelu, wasm_x4, xnn_f32_vlrelu_ukernel__wasm_x4)
+    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+    ->UseRealTime();
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+// Scalar kernels sit outside every architecture guard, so they are registered in all builds.
+BENCHMARK_CAPTURE(
+    f32_vlrelu, scalar_x1, xnn_f32_vlrelu_ukernel__scalar_x1)
+  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+  ->UseRealTime();
+BENCHMARK_CAPTURE(
+    f32_vlrelu, scalar_x2, xnn_f32_vlrelu_ukernel__scalar_x2)
+  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+  ->UseRealTime();
+BENCHMARK_CAPTURE(
+    f32_vlrelu, scalar_x4, xnn_f32_vlrelu_ukernel__scalar_x4)
+  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, float>)
+  ->UseRealTime();
+
+#if !defined(XNNPACK_BENCHMARK_NO_MAIN)  // builds that define this macro supply main() elsewhere
+BENCHMARK_MAIN();
+#endif  // !defined(XNNPACK_BENCHMARK_NO_MAIN)