blob: 5cbe181ba6e9902c9768e36cd3c55c69381526d8 [file] [log] [blame]
Marat Dukhan4c4eb002019-12-08 21:27:49 -08001#include <algorithm>
2#include <cfloat>
3#include <chrono>
4#include <cmath>
5#include <functional>
6#include <random>
7#include <vector>
8
9#include "bench/utils.h"
10#include <xnnpack/AlignedAllocator.h>
11#include <xnnpack/common.h>
12#include <xnnpack/params.h>
13#include <xnnpack/rmax.h>
14#include <xnnpack/raddexpminusmax.h>
15#include <xnnpack/vscaleexpminusmax.h>
16
17#include <benchmark/benchmark.h>
18
19
20static void f32_vscaleexpminusmax(
21 benchmark::State& state,
22 xnn_f32_rmax_ukernel_function rmax,
23 xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,
24 xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax,
25 benchmark::utils::IsaCheckFunction isa_check = nullptr)
26{
27 if (isa_check && !isa_check(state)) {
28 return;
29 }
30
31 const size_t n = state.range(0);
32 const size_t cache_line_size_max = 128;
33 const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float));
34
35 std::random_device random_device;
36 auto rng = std::mt19937(random_device());
37 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), rng);
38
39 const size_t num_buffers = 1 +
40 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float));
41 std::vector<float, AlignedAllocator<float, 64>> x(n);
42 std::vector<float, AlignedAllocator<float, 64>> y(packed_n * num_buffers);
43
44 std::generate(x.begin(), x.end(), std::ref(f32rng));
45
46 benchmark::utils::DisableDenormals();
47
48 size_t buffer_index = 0;
49 for (auto _ : state) {
50 state.PauseTiming();
51 float x_max = nanf("");
52 rmax(n * sizeof(float), x.data(), &x_max);
53 float y_sum = nanf("");
54 raddexpminusmax(n * sizeof(float), x.data(), &y_sum, x_max);
55 if (++buffer_index == num_buffers) {
56 buffer_index = 0;
57 }
58 state.ResumeTiming();
59
60 vscaleexpminusmax(n * sizeof(float), x.data(), y.data() + packed_n * buffer_index, x_max, 1.0f / y_sum);
61 }
62
63 state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
64 state.counters["elements"] =
65 benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate);
66 state.counters["bytes"] =
67 benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate);
68}
69
70static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
71 b->ArgName("N");
72 for (int32_t n = 10000; n <= 100000000; n *= 10) {
73 b->Arg(n);
74 }
75}
76
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Registers one AVX512F benchmark named avx512f_p5_scalef_x<elements>.
  // Every variant uses the same rmax and raddexpminusmax kernels; only the
  // vscaleexpminusmax kernel under test changes with <elements>.
  #define XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX512F(elements) \
    BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_x##elements, \
      xnn_f32_rmax_ukernel__avx512f, \
      xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2, \
      xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x##elements, \
      benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime()

  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX512F(16);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX512F(32);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX512F(48);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX512F(64);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX512F(80);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX512F(96);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX512F(112);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX512F(128);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX512F(144);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX512F(160);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX512F(176);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX512F(192);

  #undef XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX512F

  // Registers one AVX2 benchmark named avx2_p5_x<elements>.
  // Every variant uses the AVX rmax kernel and the same AVX2 raddexpminusmax
  // kernel; only the vscaleexpminusmax kernel under test changes.
  #define XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX2(elements) \
    BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_x##elements, \
      xnn_f32_rmax_ukernel__avx, \
      xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2, \
      xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x##elements, \
      benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime()

  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX2(8);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX2(16);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX2(24);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX2(32);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX2(40);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX2(48);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX2(56);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX2(64);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX2(72);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX2(80);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX2(88);
  XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX2(96);

  #undef XNN_BENCHMARK_F32_VSCALEEXPMINUSMAX_AVX2
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
200
201#ifndef XNNPACK_BENCHMARK_NO_MAIN
202BENCHMARK_MAIN();
203#endif