// blob: 568a982f2a63fcd635fdd12402872af12694da4f [file] [log] [blame]
#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <random>
#include <vector>

#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/params.h>
#include <xnnpack/rmax.h>
#include <xnnpack/raddexpminusmax.h>
#include <xnnpack/vscaleexpminusmax.h>

#include <benchmark/benchmark.h>


20static void f32_vscaleexpminusmax(
21 benchmark::State& state,
22 xnn_f32_rmax_ukernel_function rmax,
23 xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,
24 xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax,
25 benchmark::utils::IsaCheckFunction isa_check = nullptr)
26{
27 if (isa_check && !isa_check(state)) {
28 return;
29 }
30
Marat Dukhand713e8a2020-12-04 14:23:12 -080031 const size_t elements = state.range(0);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080032 const size_t cache_line_size_max = 128;
Marat Dukhand713e8a2020-12-04 14:23:12 -080033 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
Marat Dukhan4c4eb002019-12-08 21:27:49 -080034
35 std::random_device random_device;
36 auto rng = std::mt19937(random_device());
Marat Dukhan44f0ca72020-08-02 21:46:58 -070037 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
Marat Dukhan4c4eb002019-12-08 21:27:49 -080038
39 const size_t num_buffers = 1 +
Marat Dukhand713e8a2020-12-04 14:23:12 -080040 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
41 std::vector<float, AlignedAllocator<float, 64>> x(elements);
42 std::vector<float, AlignedAllocator<float, 64>> y(packed_elements * num_buffers);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080043
44 std::generate(x.begin(), x.end(), std::ref(f32rng));
45
46 benchmark::utils::DisableDenormals();
47
48 size_t buffer_index = 0;
49 for (auto _ : state) {
50 state.PauseTiming();
51 float x_max = nanf("");
Marat Dukhand713e8a2020-12-04 14:23:12 -080052 rmax(elements * sizeof(float), x.data(), &x_max);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080053 float y_sum = nanf("");
Marat Dukhand713e8a2020-12-04 14:23:12 -080054 raddexpminusmax(elements * sizeof(float), x.data(), &y_sum, x_max);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080055 if (++buffer_index == num_buffers) {
56 buffer_index = 0;
57 }
58 state.ResumeTiming();
59
Marat Dukhand713e8a2020-12-04 14:23:12 -080060 vscaleexpminusmax(elements * sizeof(float), x.data(), y.data() + packed_elements * buffer_index, x_max, 1.0f / y_sum);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080061 }
62
Marat Dukhand713e8a2020-12-04 14:23:12 -080063 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
64 if (cpu_frequency != 0) {
65 state.counters["cpufreq"] = cpu_frequency;
66 }
67
68 const size_t elements_per_iteration = elements;
Marat Dukhan4c4eb002019-12-08 21:27:49 -080069 state.counters["elements"] =
Marat Dukhand713e8a2020-12-04 14:23:12 -080070 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
71
72 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080073 state.counters["bytes"] =
Marat Dukhand713e8a2020-12-04 14:23:12 -080074 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080075}
76
77static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
78 b->ArgName("N");
79 for (int32_t n = 10000; n <= 100000000; n *= 10) {
80 b->Arg(n);
81 }
82}
83
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // AVX512F variants of the kernel under test. All of them share the same
  // rmax and raddexpminusmax helper kernels and are gated by CheckAVX512F.
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_x16,
    xnn_f32_rmax_ukernel__avx512f,
    xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x16,
    benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_x32,
    xnn_f32_rmax_ukernel__avx512f,
    xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x32,
    benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_x48,
    xnn_f32_rmax_ukernel__avx512f,
    xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x48,
    benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_x64,
    xnn_f32_rmax_ukernel__avx512f,
    xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x64,
    benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_x80,
    xnn_f32_rmax_ukernel__avx512f,
    xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x80,
    benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_x96,
    xnn_f32_rmax_ukernel__avx512f,
    xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x96,
    benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_x112,
    xnn_f32_rmax_ukernel__avx512f,
    xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x112,
    benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_x128,
    xnn_f32_rmax_ukernel__avx512f,
    xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x128,
    benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_x144,
    xnn_f32_rmax_ukernel__avx512f,
    xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x144,
    benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_x160,
    xnn_f32_rmax_ukernel__avx512f,
    xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x160,
    benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_x176,
    xnn_f32_rmax_ukernel__avx512f,
    xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x176,
    benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx512f_p5_scalef_x192,
    xnn_f32_rmax_ukernel__avx512f,
    xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x192,
    benchmark::utils::CheckAVX512F)->Apply(CharacteristicArguments)->UseRealTime();

  // AVX2 variants of the kernel under test. All of them share the same rmax
  // and raddexpminusmax helper kernels and are gated by CheckAVX2.
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_x8,
    xnn_f32_rmax_ukernel__avx,
    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x8,
    benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_x16,
    xnn_f32_rmax_ukernel__avx,
    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x16,
    benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_x24,
    xnn_f32_rmax_ukernel__avx,
    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x24,
    benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_x32,
    xnn_f32_rmax_ukernel__avx,
    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x32,
    benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_x40,
    xnn_f32_rmax_ukernel__avx,
    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x40,
    benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_x48,
    xnn_f32_rmax_ukernel__avx,
    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x48,
    benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_x56,
    xnn_f32_rmax_ukernel__avx,
    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x56,
    benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_x64,
    xnn_f32_rmax_ukernel__avx,
    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x64,
    benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_x72,
    xnn_f32_rmax_ukernel__avx,
    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x72,
    benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_x80,
    xnn_f32_rmax_ukernel__avx,
    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x80,
    benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_x88,
    xnn_f32_rmax_ukernel__avx,
    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x88,
    benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
  BENCHMARK_CAPTURE(f32_vscaleexpminusmax, avx2_p5_x96,
    xnn_f32_rmax_ukernel__avx,
    xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80_acc2,
    xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x96,
    benchmark::utils::CheckAVX2)->Apply(CharacteristicArguments)->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

208#ifndef XNNPACK_BENCHMARK_NO_MAIN
209BENCHMARK_MAIN();
210#endif