// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
#include <algorithm>
#include <array>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#include "bench/utils.h"

#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/params.h>
#include <xnnpack/lut.h>
20
21
22static void x8_lut(
23 benchmark::State& state,
24 xnn_x8_lut_ukernel_function lut,
25 benchmark::utils::IsaCheckFunction isa_check = nullptr)
26{
27 if (isa_check && !isa_check(state)) {
28 return;
29 }
30
31 const size_t num_elements = state.range(0);
32 std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> input(num_elements);
33 std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> output(num_elements);
34 std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> table(256);
35
36 std::random_device random_device;
37 auto rng = std::mt19937(random_device());
38 auto u8rng = std::bind(
39 std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
40 std::generate(input.begin(), input.end(), std::ref(u8rng));
41 std::generate(table.begin(), table.end(), std::ref(u8rng));
42 std::fill(output.begin(), output.end(), UINT8_C(0xAA));
43
44 for (auto _ : state) {
45 lut(num_elements * sizeof(uint8_t), input.data(), output.data(), table.data());
46 }
47
48 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
49 if (cpu_frequency != 0) {
50 state.counters["cpufreq"] = cpu_frequency;
51 }
52
53 const size_t elements_per_iteration = num_elements;
54 state.counters["elements"] =
55 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
56
57 const size_t bytes_per_iteration = 2 * num_elements * sizeof(uint8_t);
58 state.counters["bytes"] =
59 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
60}
61
#if XNN_ARCH_ARM64
  // Registered only for ARM64 builds. No ISA check is passed, so x8_lut runs
  // these kernels unconditionally.
  BENCHMARK_CAPTURE(x8_lut, neon_tbx128x4_x16, xnn_x8_lut_ukernel__neon_tbx128x4_x16)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, neon_tbx128x4_x32, xnn_x8_lut_ukernel__neon_tbx128x4_x32)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, neon_tbx128x4_x48, xnn_x8_lut_ukernel__neon_tbx128x4_x48)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, neon_tbx128x4_x64, xnn_x8_lut_ukernel__neon_tbx128x4_x64)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
#endif  // XNN_ARCH_ARM64
80
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Every x86 registration passes an ISA check as the third captured argument;
  // x8_lut returns without running the kernel when that check returns false.

  // Kernels checked with CheckAVX512SKX.
  BENCHMARK_CAPTURE(x8_lut, avx512skx_vpshufb_x64,
                    xnn_x8_lut_ukernel__avx512skx_vpshufb_x64, benchmark::utils::CheckAVX512SKX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, avx512skx_vpshufb_x128,
                    xnn_x8_lut_ukernel__avx512skx_vpshufb_x128, benchmark::utils::CheckAVX512SKX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, avx512skx_vpshufb_x192,
                    xnn_x8_lut_ukernel__avx512skx_vpshufb_x192, benchmark::utils::CheckAVX512SKX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, avx512skx_vpshufb_x256,
                    xnn_x8_lut_ukernel__avx512skx_vpshufb_x256, benchmark::utils::CheckAVX512SKX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();

  // Kernels checked with CheckAVX2.
  BENCHMARK_CAPTURE(x8_lut, avx2_x32,
                    xnn_x8_lut_ukernel__avx2_x32, benchmark::utils::CheckAVX2)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, avx2_x64,
                    xnn_x8_lut_ukernel__avx2_x64, benchmark::utils::CheckAVX2)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, avx2_x96,
                    xnn_x8_lut_ukernel__avx2_x96, benchmark::utils::CheckAVX2)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, avx2_x128,
                    xnn_x8_lut_ukernel__avx2_x128, benchmark::utils::CheckAVX2)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();

  // Kernels checked with CheckAVX.
  BENCHMARK_CAPTURE(x8_lut, avx_x16,
                    xnn_x8_lut_ukernel__avx_x16, benchmark::utils::CheckAVX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, avx_x32,
                    xnn_x8_lut_ukernel__avx_x32, benchmark::utils::CheckAVX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, avx_x48,
                    xnn_x8_lut_ukernel__avx_x48, benchmark::utils::CheckAVX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, avx_x64,
                    xnn_x8_lut_ukernel__avx_x64, benchmark::utils::CheckAVX)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();

  // Kernels checked with CheckSSSE3.
  BENCHMARK_CAPTURE(x8_lut, ssse3_x16,
                    xnn_x8_lut_ukernel__ssse3_x16, benchmark::utils::CheckSSSE3)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, ssse3_x32,
                    xnn_x8_lut_ukernel__ssse3_x32, benchmark::utils::CheckSSSE3)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
156
#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
  // Registered for both WAsm SIMD and WAsm Relaxed SIMD builds. No ISA check
  // is passed, so x8_lut runs these kernels unconditionally.
  BENCHMARK_CAPTURE(x8_lut, wasmsimd_x16, xnn_x8_lut_ukernel__wasmsimd_x16)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, wasmsimd_x32, xnn_x8_lut_ukernel__wasmsimd_x32)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, wasmsimd_x48, xnn_x8_lut_ukernel__wasmsimd_x48)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
  BENCHMARK_CAPTURE(x8_lut, wasmsimd_x64, xnn_x8_lut_ukernel__wasmsimd_x64)
      ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
      ->UseRealTime();
#endif  // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhana4ad9882021-09-18 08:06:04 -0700175
Marat Dukhan54074372021-09-08 23:28:46 -0700176BENCHMARK_CAPTURE(x8_lut, scalar_x1,
177 xnn_x8_lut_ukernel__scalar_x1)
178 ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
179 ->UseRealTime();
180BENCHMARK_CAPTURE(x8_lut, scalar_x2,
181 xnn_x8_lut_ukernel__scalar_x2)
182 ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
183 ->UseRealTime();
184BENCHMARK_CAPTURE(x8_lut, scalar_x4,
185 xnn_x8_lut_ukernel__scalar_x4)
186 ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
187 ->UseRealTime();
188BENCHMARK_CAPTURE(x8_lut, scalar_x8,
189 xnn_x8_lut_ukernel__scalar_x8)
190 ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
191 ->UseRealTime();
192BENCHMARK_CAPTURE(x8_lut, scalar_x16,
193 xnn_x8_lut_ukernel__scalar_x16)
194 ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
195 ->UseRealTime();
196
197#ifndef XNNPACK_BENCHMARK_NO_MAIN
198BENCHMARK_MAIN();
199#endif