Refactor x32 transpose benchmark

PiperOrigin-RevId: 424050952
diff --git a/bench/x32-transpose.cc b/bench/x32-transpose.cc
index 7f6f6b3..8e54ea5 100644
--- a/bench/x32-transpose.cc
+++ b/bench/x32-transpose.cc
@@ -11,36 +11,35 @@
 #include <algorithm>
 #include <cmath>
 #include <functional>
-#include <random>
+#include <numeric>
 #include <vector>
 
 #include "bench/utils.h"
 #include <benchmark/benchmark.h>
 
-static void x32_transpose(
+void transpose(
     benchmark::State& state,
     xnn_x32_transpose_ukernel_function transpose,
-    size_t ukernel_size,
-    benchmark::utils::IsaCheckFunction isa_check = nullptr) {
+    benchmark::utils::IsaCheckFunction isa_check = nullptr)
+{
   if (isa_check && !isa_check(state)) {
     return;
   }
-
-  std::random_device random_device;
-  auto rng = std::mt19937(random_device());
-  auto u32rng = std::bind(std::uniform_int_distribution<uint32_t>(), rng);
-  const size_t ukernel_bytes = ukernel_size * sizeof(uint32_t);
+  const size_t height = state.range(0);
+  const size_t width = state.range(1);
+  const size_t tile_hbytes = height * sizeof(uint32_t);
+  const size_t tile_wbytes = width * sizeof(uint32_t);
 
   std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> x(
-      ukernel_size * ukernel_size + XNN_EXTRA_BYTES / sizeof(uint32_t));
+      height * width + XNN_EXTRA_BYTES / sizeof(uint32_t));
   std::vector<uint32_t, AlignedAllocator<uint32_t, 64>> y(
-      ukernel_size * ukernel_size + XNN_EXTRA_BYTES / sizeof(uint32_t));
-  std::generate(x.begin(), x.end(), std::ref(u32rng));
+      height * width + XNN_EXTRA_BYTES / sizeof(uint32_t));
+  std::iota(x.begin(), x.end(), 0);
   std::fill(y.begin(), y.end(), 0);
 
   for (auto _ : state) {
-    transpose(x.data(), y.data(), ukernel_bytes, ukernel_bytes, ukernel_size,
-              ukernel_size);
+    transpose(x.data(), y.data(), tile_wbytes, tile_hbytes, width,
+              height);
   }
 
   const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
@@ -49,57 +48,76 @@
   }
 }
 
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_1x2, xnn_x32_transpose_ukernel__1x2_scalar_int, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_1x4, xnn_x32_transpose_ukernel__1x4_scalar_int, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_2x1, xnn_x32_transpose_ukernel__2x1_scalar_int, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_2x2, xnn_x32_transpose_ukernel__2x2_scalar_int, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_2x4, xnn_x32_transpose_ukernel__2x4_scalar_int, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_4x1, xnn_x32_transpose_ukernel__4x1_scalar_int, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_4x2, xnn_x32_transpose_ukernel__4x2_scalar_int, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_int_32_4x4, xnn_x32_transpose_ukernel__4x4_scalar_int, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_1x2, xnn_x32_transpose_ukernel__1x2_scalar_float, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_1x4, xnn_x32_transpose_ukernel__1x4_scalar_float, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_2x1, xnn_x32_transpose_ukernel__2x1_scalar_float, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_2x2, xnn_x32_transpose_ukernel__2x2_scalar_float, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_2x4, xnn_x32_transpose_ukernel__2x4_scalar_float, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_4x1, xnn_x32_transpose_ukernel__4x1_scalar_float, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_4x2, xnn_x32_transpose_ukernel__4x2_scalar_float, 32)
-    ->UseRealTime();
-BENCHMARK_CAPTURE(x32_transpose, scalar_float_32_4x4, xnn_x32_transpose_ukernel__4x4_scalar_float, 32)
-    ->UseRealTime();
+static void BenchmarkKernelSize(benchmark::internal::Benchmark* b)
+{
+  b->ArgNames({"height", "width"});
+  b->Args({32, 32});
+  b->Args({64, 64});
+  b->Args({117, 117});
+  b->Args({1024, 1024});
+}
+
+BENCHMARK_CAPTURE(transpose, 1x2_scalar_int, xnn_x32_transpose_ukernel__1x2_scalar_int)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 1x4_scalar_int, xnn_x32_transpose_ukernel__1x4_scalar_int)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 2x1_scalar_int, xnn_x32_transpose_ukernel__2x1_scalar_int)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 2x2_scalar_int, xnn_x32_transpose_ukernel__2x2_scalar_int)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 2x4_scalar_int, xnn_x32_transpose_ukernel__2x4_scalar_int)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 4x1_scalar_int, xnn_x32_transpose_ukernel__4x1_scalar_int)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 4x2_scalar_int, xnn_x32_transpose_ukernel__4x2_scalar_int)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 4x4_scalar_int, xnn_x32_transpose_ukernel__4x4_scalar_int)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 1x2_scalar_float, xnn_x32_transpose_ukernel__1x2_scalar_float)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 1x4_scalar_float, xnn_x32_transpose_ukernel__1x4_scalar_float)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 2x1_scalar_float, xnn_x32_transpose_ukernel__2x1_scalar_float)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 2x2_scalar_float, xnn_x32_transpose_ukernel__2x2_scalar_float)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 2x4_scalar_float, xnn_x32_transpose_ukernel__2x4_scalar_float)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 4x1_scalar_float, xnn_x32_transpose_ukernel__4x1_scalar_float)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 4x2_scalar_float, xnn_x32_transpose_ukernel__4x2_scalar_float)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
+BENCHMARK_CAPTURE(transpose, 4x4_scalar_float, xnn_x32_transpose_ukernel__4x4_scalar_float)
+    ->Apply(BenchmarkKernelSize)->UseRealTime();
 
 #if XNN_ARCH_ARM64
-  BENCHMARK_CAPTURE(x32_transpose, aarch64_32_tbl, xnn_x32_transpose_ukernel__4x4_aarch64_neon_tbl, 32)
-      ->UseRealTime();
-  BENCHMARK_CAPTURE(x32_transpose, aarch64_117_tbl, xnn_x32_transpose_ukernel__4x4_aarch64_neon_tbl, 117)
-      ->UseRealTime();
-  BENCHMARK_CAPTURE(x32_transpose, aarch64_1024_tbl, xnn_x32_transpose_ukernel__4x4_aarch64_neon_tbl, 1024)
-      ->UseRealTime();
+  BENCHMARK_CAPTURE(transpose, 4x4_aarch64_neon_tbl, xnn_x32_transpose_ukernel__4x4_aarch64_neon_tbl)
+      ->Apply(BenchmarkKernelSize)->UseRealTime();
 #endif  // XNN_ARCH_ARM64
 
+#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+  BENCHMARK_CAPTURE(transpose, 4x4_wasmsimd, xnn_x32_transpose_ukernel__4x4_wasmsimd)
+      ->Apply(BenchmarkKernelSize)->UseRealTime();
+#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
+
 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
-  BENCHMARK_CAPTURE(x32_transpose, sse_32, xnn_x32_transpose_ukernel__4x4_sse, 32)
-      ->UseRealTime();
-  BENCHMARK_CAPTURE(x32_transpose, sse_117, xnn_x32_transpose_ukernel__4x4_sse, 117)
-      ->UseRealTime();
-  BENCHMARK_CAPTURE(x32_transpose, sse_1024, xnn_x32_transpose_ukernel__4x4_sse, 1024)
-      ->UseRealTime();
+  BENCHMARK_CAPTURE(transpose, 4x4_sse, xnn_x32_transpose_ukernel__4x4_sse)
+      ->Apply(BenchmarkKernelSize)->UseRealTime();
+  BENCHMARK_CAPTURE(transpose, 4x4_multi_dec_sse2, xnn_x32_transpose_ukernel__4x4_multi_dec_sse2)
+      ->Apply(BenchmarkKernelSize)->UseRealTime();
+  BENCHMARK_CAPTURE(transpose, 4x4_multi_multi_sse2, xnn_x32_transpose_ukernel__4x4_multi_multi_sse2)
+      ->Apply(BenchmarkKernelSize)->UseRealTime();
+  BENCHMARK_CAPTURE(transpose, 4x4_multi_switch_sse2, xnn_x32_transpose_ukernel__4x4_multi_switch_sse2)
+      ->Apply(BenchmarkKernelSize)->UseRealTime();
+  BENCHMARK_CAPTURE(transpose, 4x4_reuse_dec_sse2, xnn_x32_transpose_ukernel__4x4_reuse_dec_sse2)
+      ->Apply(BenchmarkKernelSize)->UseRealTime();
+  BENCHMARK_CAPTURE(transpose, 4x4_reuse_multi_sse2, xnn_x32_transpose_ukernel__4x4_reuse_multi_sse2)
+      ->Apply(BenchmarkKernelSize)->UseRealTime();
+  BENCHMARK_CAPTURE(transpose, 4x4_reuse_switch_sse2, xnn_x32_transpose_ukernel__4x4_reuse_switch_sse2)
+      ->Apply(BenchmarkKernelSize)->UseRealTime();
 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
 
+
 #ifndef XNNPACK_BENCHMARK_NO_MAIN
 BENCHMARK_MAIN();
 #endif