blob: 50d46333d89d55ab876dd025a3c10e1406aa1e82 [file] [log] [blame]
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <algorithm>
7#include <cmath>
8#include <functional>
9#include <random>
10#include <vector>
11
12#include <benchmark/benchmark.h>
13#include "bench/utils.h"
14#include <xnnpack/AlignedAllocator.h>
15#include <xnnpack/common.h>
16#include <xnnpack/params.h>
17#include <xnnpack/raddexpminusmax.h>
18#include <xnnpack/rmax.h>
19
20
21static void f32_raddexpminusmax(
22 benchmark::State& state,
23 xnn_f32_rmax_ukernel_function rmax,
24 xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax,
25 benchmark::utils::IsaCheckFunction isa_check = nullptr)
26{
27 if (isa_check && !isa_check(state)) {
28 return;
29 }
30
Marat Dukhand713e8a2020-12-04 14:23:12 -080031 const size_t elements = state.range(0);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080032 const size_t cache_line_size_max = 128;
Marat Dukhand713e8a2020-12-04 14:23:12 -080033 const size_t packed_elements = benchmark::utils::RoundUp(elements, cache_line_size_max / sizeof(float));
Marat Dukhan4c4eb002019-12-08 21:27:49 -080034
35 std::random_device random_device;
36 auto rng = std::mt19937(random_device());
Marat Dukhan44f0ca72020-08-02 21:46:58 -070037 auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), std::ref(rng));
Marat Dukhan4c4eb002019-12-08 21:27:49 -080038
39 const size_t num_buffers = 1 +
Marat Dukhand713e8a2020-12-04 14:23:12 -080040 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_elements * sizeof(float));
41 std::vector<float, AlignedAllocator<float, 64>> x(elements);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080042
43 std::generate(x.begin(), x.end(), std::ref(f32rng));
44
45 benchmark::utils::DisableDenormals();
46
47 size_t buffer_index = 0;
48 for (auto _ : state) {
49 state.PauseTiming();
50 float x_max = nanf("");
Marat Dukhand713e8a2020-12-04 14:23:12 -080051 rmax(elements * sizeof(float), x.data(), &x_max);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080052 if (++buffer_index == num_buffers) {
53 buffer_index = 0;
54 }
55 state.ResumeTiming();
56
57 float y_sum = nanf("");
Marat Dukhand713e8a2020-12-04 14:23:12 -080058 raddexpminusmax(elements * sizeof(float), x.data(), &y_sum, x_max);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080059 }
60
Marat Dukhand713e8a2020-12-04 14:23:12 -080061 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
62 if (cpu_frequency != 0) {
63 state.counters["cpufreq"] = cpu_frequency;
64 }
65
66 const size_t elements_per_iteration = elements;
Marat Dukhan4c4eb002019-12-08 21:27:49 -080067 state.counters["elements"] =
Marat Dukhand713e8a2020-12-04 14:23:12 -080068 benchmark::Counter(uint64_t(state.iterations()) * elements_per_iteration, benchmark::Counter::kIsRate);
69
70 const size_t bytes_per_iteration = 2 * elements * sizeof(float);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080071 state.counters["bytes"] =
Marat Dukhand713e8a2020-12-04 14:23:12 -080072 benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
Marat Dukhan4c4eb002019-12-08 21:27:49 -080073}
74
75static void CharacteristicArguments(benchmark::internal::Benchmark* b) {
76 b->ArgName("N");
77 for (int32_t n = 10000; n <= 100000000; n *= 10) {
78 b->Arg(n);
79 }
80}
81
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Registers f32_raddexpminusmax for one kernel variant. `variant` is both
  // the benchmark case name and the suffix of the measured kernel symbol;
  // every case uses xnn_f32_rmax_ukernel__avx for the untimed rmax step.
  #define BENCHMARK_F32_RADDEXPMINUSMAX_X86(variant, isa_check)     \
    BENCHMARK_CAPTURE(f32_raddexpminusmax, variant,                 \
                      xnn_f32_rmax_ukernel__avx,                    \
                      xnn_f32_raddexpminusmax_ukernel__##variant,   \
                      isa_check)                                    \
      ->Apply(CharacteristicArguments)->UseRealTime()

  // AVX2 kernels.
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx2_p5_x64, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx2_p5_x64_acc2, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx2_p5_x64_acc4, benchmark::utils::CheckAVX2);

  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx2_p5_x72, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx2_p5_x72_acc3, benchmark::utils::CheckAVX2);

  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx2_p5_x80, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx2_p5_x80_acc2, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx2_p5_x80_acc5, benchmark::utils::CheckAVX2);

  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx2_p5_x96, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx2_p5_x96_acc2, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx2_p5_x96_acc3, benchmark::utils::CheckAVX2);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx2_p5_x96_acc6, benchmark::utils::CheckAVX2);

  // AVX512F kernels.
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx512f_p5_scalef_x128, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx512f_p5_scalef_x128_acc2, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx512f_p5_scalef_x128_acc4, benchmark::utils::CheckAVX512F);

  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx512f_p5_scalef_x144, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx512f_p5_scalef_x144_acc3, benchmark::utils::CheckAVX512F);

  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx512f_p5_scalef_x160, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx512f_p5_scalef_x160_acc2, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx512f_p5_scalef_x160_acc5, benchmark::utils::CheckAVX512F);

  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx512f_p5_scalef_x192, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx512f_p5_scalef_x192_acc2, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx512f_p5_scalef_x192_acc3, benchmark::utils::CheckAVX512F);
  BENCHMARK_F32_RADDEXPMINUSMAX_X86(avx512f_p5_scalef_x192_acc6, benchmark::utils::CheckAVX512F);

  #undef BENCHMARK_F32_RADDEXPMINUSMAX_X86
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
187
188#ifndef XNNPACK_BENCHMARK_NO_MAIN
189BENCHMARK_MAIN();
190#endif