Refactor x32 transpose benchmark
PiperOrigin-RevId: 424050952
diff --git a/bench/x32-transpose.cc b/bench/x32-transpose.cc
index 7f6f6b3..8e54ea5 100644
--- a/bench/x32-transpose.cc
+++ b/bench/x32-transpose.cc
@@ -11,36 +11,35 @@
#include <algorithm>
#include <cmath>
#include <functional>
-#include <random>
+#include <numeric>
#include <vector>
#include "bench/utils.h"
#include <benchmark/benchmark.h>
-static void x32_transpose(
+void transpose(
benchmark::State& state,
xnn_x32_transpose_ukernel_function transpose,
- size_t ukernel_size,
- benchmark::utils::IsaCheckFunction isa_check = nullptr) {
+ benchmark::utils::IsaCheckFunction isa_check = nullptr)
+{
if (isa_check && !isa_check(state)) {
return;
}
-
- std::random_device random_device;
- auto rng = std::mt19937(random_device());
- auto u32rng = std::bind(std::uniform_int_distribution<uint32_t>(), rng);
- const size_t ukernel_bytes = ukernel_size * sizeof(uint32_t);
+ const size_t height = state.range(0);
+ const size_t width = state.range(1);
+ const size_t tile_hbytes = height * sizeof(uint32_t);
+ const size_t tile_wbytes = width * sizeof(uint32_t);
std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> x(
- ukernel_size * ukernel_size + XNN_EXTRA_BYTES / sizeof(uint32_t));
+ height * width + XNN_EXTRA_BYTES / sizeof(uint32_t));
std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> y(
- ukernel_size * ukernel_size + XNN_EXTRA_BYTES / sizeof(uint32_t));
- std::generate(x.begin(), x.end(), std::ref(u32rng));
+ height * width + XNN_EXTRA_BYTES / sizeof(uint32_t));
+ std::iota(x.begin(), x.end(), 0);
std::fill(y.begin(), y.end(), 0);
for (auto _ : state) {
- transpose(x.data(), y.data(), ukernel_bytes, ukernel_bytes, ukernel_size,
- ukernel_size);
+ transpose(x.data(), y.data(), tile_wbytes, tile_hbytes, width,
+ height);
}
const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
@@ -49,57 +48,76 @@
}
}
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_1x2, xnn_x32_transpose_ukernel__1x2_scalar_int, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_1x4, xnn_x32_transpose_ukernel__1x4_scalar_int, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_2x1, xnn_x32_transpose_ukernel__2x1_scalar_int, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_2x2, xnn_x32_transpose_ukernel__2x2_scalar_int, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_2x4, xnn_x32_transpose_ukernel__2x4_scalar_int, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_4x1, xnn_x32_transpose_ukernel__4x1_scalar_int, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_4x2, xnn_x32_transpose_ukernel__4x2_scalar_int, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_4x4, xnn_x32_transpose_ukernel__4x4_scalar_int, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_1x2, xnn_x32_transpose_ukernel__1x2_scalar_float, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_1x4, xnn_x32_transpose_ukernel__1x4_scalar_float, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_2x1, xnn_x32_transpose_ukernel__2x1_scalar_float, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_2x2, xnn_x32_transpose_ukernel__2x2_scalar_float, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_2x4, xnn_x32_transpose_ukernel__2x4_scalar_float, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_4x1, xnn_x32_transpose_ukernel__4x1_scalar_float, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_4x2, xnn_x32_transpose_ukernel__4x2_scalar_float, 32)
- ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_4x4, xnn_x32_transpose_ukernel__4x4_scalar_float, 32)
- ->UseRealTime();
+static void BenchmarkKernelSize(benchmark::internal::Benchmark* b)  // Adds the {height, width} argument sets to benchmark `b`; attached to each registration below via ->Apply(...).
+{
+ b->ArgNames({"height", "width"});  // order matches transpose(): state.range(0) is read as height, state.range(1) as width
+ b->Args({32, 32});  // every shape registered here is square
+ b->Args({64, 64});
+ b->Args({117, 117});  // 117 is odd, so not a multiple of the 2- or 4-wide tiles named in the kernels; presumably covers remainder handling -- TODO confirm
+ b->Args({1024, 1024});
+}
+
+BENCHMARK_CAPTURE(transpose, 1x2_scalar_int, xnn_x32_transpose_ukernel__1x2_scalar_int)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 1x4_scalar_int, xnn_x32_transpose_ukernel__1x4_scalar_int)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 2x1_scalar_int, xnn_x32_transpose_ukernel__2x1_scalar_int)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 2x2_scalar_int, xnn_x32_transpose_ukernel__2x2_scalar_int)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 2x4_scalar_int, xnn_x32_transpose_ukernel__2x4_scalar_int)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 4x1_scalar_int, xnn_x32_transpose_ukernel__4x1_scalar_int)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 4x2_scalar_int, xnn_x32_transpose_ukernel__4x2_scalar_int)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 4x4_scalar_int, xnn_x32_transpose_ukernel__4x4_scalar_int)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 1x2_scalar_float, xnn_x32_transpose_ukernel__1x2_scalar_float)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 1x4_scalar_float, xnn_x32_transpose_ukernel__1x4_scalar_float)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 2x1_scalar_float, xnn_x32_transpose_ukernel__2x1_scalar_float)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 2x2_scalar_float, xnn_x32_transpose_ukernel__2x2_scalar_float)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 2x4_scalar_float, xnn_x32_transpose_ukernel__2x4_scalar_float)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 4x1_scalar_float, xnn_x32_transpose_ukernel__4x1_scalar_float)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 4x2_scalar_float, xnn_x32_transpose_ukernel__4x2_scalar_float)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 4x4_scalar_float, xnn_x32_transpose_ukernel__4x4_scalar_float)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
#if XNN_ARCH_ARM64
- BENCHMARK_CAPTURE(x32_transpose, aarch64_32_tbl, xnn_x32_transpose_ukernel__4x4_aarch64_neon_tbl, 32)
- ->UseRealTime();
- BENCHMARK_CAPTURE(x32_transpose, aarch64_117_tbl, xnn_x32_transpose_ukernel__4x4_aarch64_neon_tbl, 117)
- ->UseRealTime();
- BENCHMARK_CAPTURE(x32_transpose, aarch64_1024_tbl, xnn_x32_transpose_ukernel__4x4_aarch64_neon_tbl, 1024)
- ->UseRealTime();
+ BENCHMARK_CAPTURE(transpose, 4x4_aarch64_neon_tbl, xnn_x32_transpose_ukernel__4x4_aarch64_neon_tbl)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
#endif // XNN_ARCH_ARM64
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+ BENCHMARK_CAPTURE(transpose, 4x4_wasmsimd, xnn_x32_transpose_ukernel__4x4_wasmsimd)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
- BENCHMARK_CAPTURE(x32_transpose, sse_32, xnn_x32_transpose_ukernel__4x4_sse, 32)
- ->UseRealTime();
- BENCHMARK_CAPTURE(x32_transpose, sse_117, xnn_x32_transpose_ukernel__4x4_sse, 117)
- ->UseRealTime();
- BENCHMARK_CAPTURE(x32_transpose, sse_1024, xnn_x32_transpose_ukernel__4x4_sse, 1024)
- ->UseRealTime();
+ BENCHMARK_CAPTURE(transpose, 4x4_sse, xnn_x32_transpose_ukernel__4x4_sse)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(transpose, 4x4_multi_dec_sse2, xnn_x32_transpose_ukernel__4x4_multi_dec_sse2)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(transpose, 4x4_multi_multi_sse2, xnn_x32_transpose_ukernel__4x4_multi_multi_sse2)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(transpose, 4x4_multi_switch_sse2, xnn_x32_transpose_ukernel__4x4_multi_switch_sse2)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(transpose, 4x4_reuse_dec_sse2, xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(transpose, 4x4_reuse_multi_sse2, xnn_x32_transpose_ukernel__4x4_reuse_multi_sse2)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
+ BENCHMARK_CAPTURE(transpose, 4x4_reuse_switch_sse2, xnn_x32_transpose_ukernel__4x4_reuse_switch_sse2)
+ ->Apply(BenchmarkKernelSize)->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif