Benchmarks for F32->QS8 and F32->QU8 VCVT microkernels
PiperOrigin-RevId: 413774728
diff --git a/bench/f32-qu8-vcvt.cc b/bench/f32-qu8-vcvt.cc
new file mode 100644
index 0000000..08c1d51
--- /dev/null
+++ b/bench/f32-qu8-vcvt.cc
@@ -0,0 +1,210 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <algorithm>
+#include <cmath>
+#include <functional>
+#include <random>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+#include "bench/utils.h"
+
+#include <fp16/fp16.h>
+#include <xnnpack/AlignedAllocator.h>
+#include <xnnpack/common.h>
+#include <xnnpack/params.h>
+#include <xnnpack/params-init.h>
+#include <xnnpack/vcvt.h>
+
+
+static void f32_qu8_vcvt(  // Benchmark driver for a single F32->QU8 convert microkernel.
+  benchmark::State& state,  // benchmark state; state.range(0) supplies the element count
+  xnn_f32_qu8_vcvt_ukernel_function cvt,  // microkernel under test
+  xnn_init_f32_qu8_cvt_params_fn init_params,  // fills in the params struct handed to `cvt`
+  benchmark::utils::IsaCheckFunction isa_check = nullptr)  // optional ISA gate; nullptr means no check
+{
+  if (isa_check && !isa_check(state)) {
+    return;
+  }
+  // Reached only when no ISA check was supplied or the supplied check accepted the run.
+  const size_t num_elements = state.range(0);
+  // Inputs are drawn uniformly from [-10, 10) using a generator seeded from std::random_device.
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto f32rng = std::bind(std::uniform_real_distribution<float>(-10.0f, 10.0f), std::ref(rng));
+  // The input buffer carries XNN_EXTRA_BYTES of extra floats; the output is pre-filled with 0xA5.
+  std::vector<float, AlignedAllocator<float, 64>> x(num_elements + XNN_EXTRA_BYTES / sizeof(float));
+  std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> y(num_elements);
+  std::generate(x.begin(), x.end(), std::ref(f32rng));
+  std::fill(y.begin(), y.end(), UINT8_C(0xA5));
+  // Fixed conversion parameters: scale 25, zero point 127, output limits 1 and 254.
+  xnn_f32_qu8_cvt_params params;
+  init_params(&params,
+    25.0f /* scale */, 127 /* output zero point */,
+    std::numeric_limits<uint8_t>::min() + 1 /* output min */,
+    std::numeric_limits<uint8_t>::max() - 1 /* output max */);
+  for (auto _ : state) {
+    // NOTE(review): the size is num_elements * sizeof(uint8_t); confirm the kernel does not expect input (float) bytes.
+    cvt(num_elements * sizeof(uint8_t), x.data(), y.data(), &params);
+  }
+  // The CPU frequency counter is only published when the helper returns a non-zero value.
+  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
+  if (cpu_frequency != 0) {
+    state.counters["cpufreq"] = cpu_frequency;
+  }
+  // Rate counters: elements per iteration, and bytes counted as one uint8_t plus one float per element.
+  const size_t elements_per_iteration = num_elements;
+  state.counters["elements"] =
+    benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
+
+  const size_t bytes_per_iteration = num_elements * (sizeof(uint8_t) + sizeof(float));
+  state.counters["bytes"] =
+    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
+}
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64  // ARM registrations: the 4th captured argument is passed to f32_qu8_vcvt as isa_check
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, neonv8_x8,
+ xnn_f32_qu8_vcvt_ukernel__neonv8_x8,
+ xnn_init_f32_qu8_cvt_neonv8_params,
+ benchmark::utils::CheckNEONV8)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)  // NOTE(review): int8_t here vs. the uint8_t output buffer; same size, presumably harmless - confirm
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, neonv8_x16,
+ xnn_f32_qu8_vcvt_ukernel__neonv8_x16,
+ xnn_init_f32_qu8_cvt_neonv8_params,
+ benchmark::utils::CheckNEONV8)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, neonv8_x24,
+ xnn_f32_qu8_vcvt_ukernel__neonv8_x24,
+ xnn_init_f32_qu8_cvt_neonv8_params,
+ benchmark::utils::CheckNEONV8)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, neonv8_x32,
+ xnn_f32_qu8_vcvt_ukernel__neonv8_x32,
+ xnn_init_f32_qu8_cvt_neonv8_params,
+ benchmark::utils::CheckNEONV8)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ // The neon_* variants below use the neon params initializer and are gated by CheckNEON instead of CheckNEONV8.
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, neon_x8,
+ xnn_f32_qu8_vcvt_ukernel__neon_x8,
+ xnn_init_f32_qu8_cvt_neon_params,
+ benchmark::utils::CheckNEON)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, neon_x16,
+ xnn_f32_qu8_vcvt_ukernel__neon_x16,
+ xnn_init_f32_qu8_cvt_neon_params,
+ benchmark::utils::CheckNEON)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, neon_x24,
+ xnn_f32_qu8_vcvt_ukernel__neon_x24,
+ xnn_init_f32_qu8_cvt_neon_params,
+ benchmark::utils::CheckNEON)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, neon_x32,
+ xnn_f32_qu8_vcvt_ukernel__neon_x32,
+ xnn_init_f32_qu8_cvt_neon_params,
+ benchmark::utils::CheckNEON)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64  // SSE2 registrations: no ISA check is captured, so isa_check keeps its nullptr default
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, sse2_x8,
+ xnn_f32_qu8_vcvt_ukernel__sse2_x8,
+ xnn_init_f32_qu8_cvt_sse2_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)  // NOTE(review): int8_t here vs. the uint8_t output buffer; same size, presumably harmless - confirm
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, sse2_x16,
+ xnn_f32_qu8_vcvt_ukernel__sse2_x16,
+ xnn_init_f32_qu8_cvt_sse2_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, sse2_x24,
+ xnn_f32_qu8_vcvt_ukernel__sse2_x24,
+ xnn_init_f32_qu8_cvt_sse2_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, sse2_x32,
+ xnn_f32_qu8_vcvt_ukernel__sse2_x32,
+ xnn_init_f32_qu8_cvt_sse2_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_WASMSIMD  // WAsm SIMD registrations: wasmsimd_cvt_* first, then wasmsimd_magic_*; none captures an ISA check
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, wasmsimd_cvt_x8,
+ xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_x8,
+ xnn_init_f32_qu8_cvt_wasmsimd_cvt_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)  // NOTE(review): int8_t here vs. the uint8_t output buffer; same size, presumably harmless - confirm
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, wasmsimd_cvt_x16,
+ xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_x16,
+ xnn_init_f32_qu8_cvt_wasmsimd_cvt_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, wasmsimd_cvt_x24,
+ xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_x24,
+ xnn_init_f32_qu8_cvt_wasmsimd_cvt_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, wasmsimd_cvt_x32,
+ xnn_f32_qu8_vcvt_ukernel__wasmsimd_cvt_x32,
+ xnn_init_f32_qu8_cvt_wasmsimd_cvt_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ // The wasmsimd_magic_* kernels below are paired with their own params initializer (wasmsimd_magic_params).
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, wasmsimd_magic_x8,
+ xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x8,
+ xnn_init_f32_qu8_cvt_wasmsimd_magic_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, wasmsimd_magic_x16,
+ xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x16,
+ xnn_init_f32_qu8_cvt_wasmsimd_magic_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, wasmsimd_magic_x24,
+ xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x24,
+ xnn_init_f32_qu8_cvt_wasmsimd_magic_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(f32_qu8_vcvt, wasmsimd_magic_x32,
+ xnn_f32_qu8_vcvt_ukernel__wasmsimd_magic_x32,
+ xnn_init_f32_qu8_cvt_wasmsimd_magic_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+#endif // XNN_ARCH_WASMSIMD
+
+BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_magic_x1,  // scalar_magic_* registrations sit outside any #if and capture no ISA check
+ xnn_f32_qu8_vcvt_ukernel__scalar_magic_x1,
+ xnn_init_f32_qu8_cvt_scalar_magic_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)  // NOTE(review): int8_t here vs. the uint8_t output buffer; same size, presumably harmless - confirm
+ ->UseRealTime();
+BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_magic_x2,
+ xnn_f32_qu8_vcvt_ukernel__scalar_magic_x2,
+ xnn_init_f32_qu8_cvt_scalar_magic_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_magic_x3,
+ xnn_f32_qu8_vcvt_ukernel__scalar_magic_x3,
+ xnn_init_f32_qu8_cvt_scalar_magic_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+BENCHMARK_CAPTURE(f32_qu8_vcvt, scalar_magic_x4,
+ xnn_f32_qu8_vcvt_ukernel__scalar_magic_x4,
+ xnn_init_f32_qu8_cvt_scalar_magic_params)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
+ ->UseRealTime();
+
+#ifndef XNNPACK_BENCHMARK_NO_MAIN  // BENCHMARK_MAIN() is omitted when the build defines XNNPACK_BENCHMARK_NO_MAIN
+BENCHMARK_MAIN();
+#endif  // XNNPACK_BENCHMARK_NO_MAIN