Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 1 | #include <algorithm> |
| 2 | #include <cfloat> |
Marat Dukhan | 4fa0fbe | 2019-10-31 10:23:46 -0700 | [diff] [blame] | 3 | #include <chrono> |
Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 4 | #include <cmath> |
| 5 | #include <functional> |
| 6 | #include <random> |
| 7 | #include <vector> |
| 8 | |
| 9 | #include "bench/utils.h" |
| 10 | #include <xnnpack/common.h> |
| 11 | #include <xnnpack/params.h> |
| 12 | #include <xnnpack/raddexpminusmax.h> |
Marat Dukhan | 4a2bbc6 | 2019-10-25 17:36:32 -0700 | [diff] [blame] | 13 | #include <xnnpack/raddextexp.h> |
Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 14 | #include <xnnpack/raddstoreexpminusmax.h> |
| 15 | #include <xnnpack/rmax.h> |
Marat Dukhan | 05ac8e3 | 2019-10-21 15:39:33 -0700 | [diff] [blame] | 16 | #include <xnnpack/vscale.h> |
Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 17 | #include <xnnpack/vscaleexpminusmax.h> |
Marat Dukhan | 4a2bbc6 | 2019-10-25 17:36:32 -0700 | [diff] [blame] | 18 | #include <xnnpack/vscaleextexp.h> |
Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 19 | |
| 20 | #include <benchmark/benchmark.h> |
| 21 | |
| 22 | |
Marat Dukhan | fd8e689 | 2020-01-27 15:25:25 -0800 | [diff] [blame] | 23 | static void ThreePassSoftMaxWithRecomputing( |
Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 24 | benchmark::State& state, |
| 25 | xnn_f32_rmax_ukernel_function rmax, |
| 26 | xnn_f32_raddexpminusmax_ukernel_function raddexpminusmax, |
Marat Dukhan | 4c4eb00 | 2019-12-08 21:27:49 -0800 | [diff] [blame] | 27 | xnn_f32_vscaleexpminusmax_ukernel_function vscaleexpminusmax, |
| 28 | benchmark::utils::IsaCheckFunction isa_check = nullptr) |
Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 29 | { |
Marat Dukhan | 4c4eb00 | 2019-12-08 21:27:49 -0800 | [diff] [blame] | 30 | if (isa_check && !isa_check(state)) { |
| 31 | return; |
| 32 | } |
| 33 | |
Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 34 | const size_t n = state.range(0); |
| 35 | const size_t cache_line_size_max = 128; |
Marat Dukhan | 4232323 | 2019-10-23 02:09:02 -0700 | [diff] [blame] | 36 | const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float)); |
Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 37 | |
| 38 | std::random_device random_device; |
| 39 | auto rng = std::mt19937(random_device()); |
| 40 | auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), rng); |
| 41 | |
| 42 | const size_t num_buffers = 1 + |
Marat Dukhan | 4232323 | 2019-10-23 02:09:02 -0700 | [diff] [blame] | 43 | benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float)); |
Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 44 | std::vector<float> x(n); |
| 45 | std::vector<float> y(packed_n * num_buffers); |
| 46 | |
| 47 | std::generate(x.begin(), x.end(), std::ref(f32rng)); |
| 48 | |
| 49 | benchmark::utils::DisableDenormals(); |
| 50 | |
| 51 | size_t buffer_index = 0; |
| 52 | for (auto _ : state) { |
Marat Dukhan | 4232323 | 2019-10-23 02:09:02 -0700 | [diff] [blame] | 53 | benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float)); |
Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 54 | if (++buffer_index == num_buffers) { |
| 55 | buffer_index = 0; |
| 56 | } |
| 57 | |
| 58 | const auto start = std::chrono::high_resolution_clock::now(); |
| 59 | float x_max = nanf(""); |
| 60 | rmax(n * sizeof(float), x.data(), &x_max); |
| 61 | float y_sum = nanf(""); |
| 62 | raddexpminusmax(n * sizeof(float), x.data(), &y_sum, x_max); |
| 63 | vscaleexpminusmax(n * sizeof(float), x.data(), y.data() + packed_n * buffer_index, x_max, 1.0f / y_sum); |
| 64 | const auto end = std::chrono::high_resolution_clock::now(); |
| 65 | |
| 66 | const auto elapsed_seconds = |
| 67 | std::chrono::duration_cast<std::chrono::duration<double>>(end - start); |
| 68 | state.SetIterationTime(elapsed_seconds.count()); |
| 69 | } |
| 70 | |
Marat Dukhan | 4a2bbc6 | 2019-10-25 17:36:32 -0700 | [diff] [blame] | 71 | state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency(); |
| 72 | state.counters["elements"] = |
| 73 | benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate); |
| 74 | state.counters["bytes"] = |
| 75 | benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate); |
Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 76 | } |
| 77 | |
Marat Dukhan | fd8e689 | 2020-01-27 15:25:25 -0800 | [diff] [blame] | 78 | static void ThreePassSoftMaxWithReloading( |
Marat Dukhan | 05ac8e3 | 2019-10-21 15:39:33 -0700 | [diff] [blame] | 79 | benchmark::State& state, |
| 80 | xnn_f32_rmax_ukernel_function rmax, |
| 81 | xnn_f32_raddstoreexpminusmax_ukernel_function raddstoreexpminusmax, |
Marat Dukhan | 4c4eb00 | 2019-12-08 21:27:49 -0800 | [diff] [blame] | 82 | xnn_f32_vscale_ukernel_function vscale, |
| 83 | benchmark::utils::IsaCheckFunction isa_check = nullptr) |
Marat Dukhan | 05ac8e3 | 2019-10-21 15:39:33 -0700 | [diff] [blame] | 84 | { |
Marat Dukhan | 4c4eb00 | 2019-12-08 21:27:49 -0800 | [diff] [blame] | 85 | if (isa_check && !isa_check(state)) { |
| 86 | return; |
| 87 | } |
| 88 | |
Marat Dukhan | 05ac8e3 | 2019-10-21 15:39:33 -0700 | [diff] [blame] | 89 | const size_t n = state.range(0); |
| 90 | const size_t cache_line_size_max = 128; |
Marat Dukhan | 4232323 | 2019-10-23 02:09:02 -0700 | [diff] [blame] | 91 | const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float)); |
Marat Dukhan | 05ac8e3 | 2019-10-21 15:39:33 -0700 | [diff] [blame] | 92 | |
| 93 | std::random_device random_device; |
| 94 | auto rng = std::mt19937(random_device()); |
| 95 | auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), rng); |
| 96 | |
| 97 | const size_t num_buffers = 1 + |
Marat Dukhan | 4232323 | 2019-10-23 02:09:02 -0700 | [diff] [blame] | 98 | benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float)); |
Marat Dukhan | 05ac8e3 | 2019-10-21 15:39:33 -0700 | [diff] [blame] | 99 | std::vector<float> x(n); |
| 100 | std::vector<float> y(packed_n * num_buffers); |
| 101 | |
| 102 | std::generate(x.begin(), x.end(), std::ref(f32rng)); |
| 103 | |
| 104 | benchmark::utils::DisableDenormals(); |
| 105 | |
| 106 | size_t buffer_index = 0; |
| 107 | for (auto _ : state) { |
Marat Dukhan | 4232323 | 2019-10-23 02:09:02 -0700 | [diff] [blame] | 108 | benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float)); |
Marat Dukhan | 05ac8e3 | 2019-10-21 15:39:33 -0700 | [diff] [blame] | 109 | if (++buffer_index == num_buffers) { |
| 110 | buffer_index = 0; |
| 111 | } |
| 112 | |
| 113 | const auto start = std::chrono::high_resolution_clock::now(); |
| 114 | float x_max = nanf(""); |
| 115 | rmax(n * sizeof(float), x.data(), &x_max); |
| 116 | float y_sum = nanf(""); |
| 117 | raddstoreexpminusmax(n * sizeof(float), x.data(), y.data() + packed_n * buffer_index, &y_sum, x_max); |
| 118 | vscale(n * sizeof(float), y.data() + packed_n * buffer_index, y.data() + packed_n * buffer_index, 1.0f / y_sum); |
| 119 | const auto end = std::chrono::high_resolution_clock::now(); |
| 120 | |
| 121 | const auto elapsed_seconds = |
| 122 | std::chrono::duration_cast<std::chrono::duration<double>>(end - start); |
| 123 | state.SetIterationTime(elapsed_seconds.count()); |
| 124 | } |
| 125 | |
Marat Dukhan | 4a2bbc6 | 2019-10-25 17:36:32 -0700 | [diff] [blame] | 126 | state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency(); |
| 127 | state.counters["elements"] = |
| 128 | benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate); |
| 129 | state.counters["bytes"] = |
| 130 | benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate); |
| 131 | } |
| 132 | |
Marat Dukhan | fd8e689 | 2020-01-27 15:25:25 -0800 | [diff] [blame] | 133 | static void TwoPassSoftMax( |
Marat Dukhan | 4a2bbc6 | 2019-10-25 17:36:32 -0700 | [diff] [blame] | 134 | benchmark::State& state, |
| 135 | xnn_f32_raddextexp_ukernel_function raddextexp, |
Marat Dukhan | 4c4eb00 | 2019-12-08 21:27:49 -0800 | [diff] [blame] | 136 | xnn_f32_vscaleextexp_ukernel_function vscaleextexp, |
| 137 | benchmark::utils::IsaCheckFunction isa_check = nullptr) |
Marat Dukhan | 4a2bbc6 | 2019-10-25 17:36:32 -0700 | [diff] [blame] | 138 | { |
Marat Dukhan | 4c4eb00 | 2019-12-08 21:27:49 -0800 | [diff] [blame] | 139 | if (isa_check && !isa_check(state)) { |
| 140 | return; |
| 141 | } |
| 142 | |
Marat Dukhan | 4a2bbc6 | 2019-10-25 17:36:32 -0700 | [diff] [blame] | 143 | const size_t n = state.range(0); |
| 144 | const size_t cache_line_size_max = 128; |
| 145 | const size_t packed_n = benchmark::utils::RoundUp(n, cache_line_size_max / sizeof(float)); |
| 146 | |
| 147 | std::random_device random_device; |
| 148 | auto rng = std::mt19937(random_device()); |
| 149 | auto f32rng = std::bind(std::uniform_real_distribution<float>(-1000.0f, 1000.0f), rng); |
| 150 | |
| 151 | const size_t num_buffers = 1 + |
| 152 | benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(), packed_n * sizeof(float)); |
| 153 | std::vector<float> x(n); |
| 154 | std::vector<float> y(packed_n * num_buffers); |
| 155 | |
| 156 | std::generate(x.begin(), x.end(), std::ref(f32rng)); |
| 157 | |
| 158 | benchmark::utils::DisableDenormals(); |
| 159 | |
| 160 | size_t buffer_index = 0; |
| 161 | for (auto _ : state) { |
| 162 | benchmark::utils::PrefetchToL1(x.data(), x.size() * sizeof(float)); |
| 163 | if (++buffer_index == num_buffers) { |
| 164 | buffer_index = 0; |
| 165 | } |
| 166 | |
| 167 | const auto start = std::chrono::high_resolution_clock::now(); |
| 168 | float scale[2]; |
| 169 | raddextexp(n * sizeof(float), x.data(), scale); |
| 170 | vscaleextexp(n * sizeof(float), x.data(), y.data() + packed_n * buffer_index, 1.0f / scale[0], -scale[1]); |
| 171 | const auto end = std::chrono::high_resolution_clock::now(); |
| 172 | |
| 173 | const auto elapsed_seconds = |
| 174 | std::chrono::duration_cast<std::chrono::duration<double>>(end - start); |
| 175 | state.SetIterationTime(elapsed_seconds.count()); |
| 176 | } |
| 177 | |
| 178 | state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency(); |
| 179 | state.counters["elements"] = |
| 180 | benchmark::Counter(uint64_t(state.iterations()) * n, benchmark::Counter::kIsRate); |
| 181 | state.counters["bytes"] = |
| 182 | benchmark::Counter(uint64_t(state.iterations()) * 2 * sizeof(float) * n, benchmark::Counter::kIsRate); |
Marat Dukhan | 05ac8e3 | 2019-10-21 15:39:33 -0700 | [diff] [blame] | 183 | } |
| 184 | |
Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 185 | static void CharacteristicArguments(benchmark::internal::Benchmark* b) { |
Marat Dukhan | 4c4eb00 | 2019-12-08 21:27:49 -0800 | [diff] [blame] | 186 | for (int32_t n = 1000; n <= 100000000; n *= 10) { |
Marat Dukhan | 4a4a7fa | 2019-10-21 13:46:14 -0700 | [diff] [blame] | 187 | b->Arg(n); |
| 188 | b->Arg(3 * n); |
| 189 | } |
| 190 | } |
| 191 | |
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Every registration below runs over the CharacteristicArguments sizes and
  // uses manual timing, matching the SetIterationTime() calls in the benchmark
  // bodies. The last captured argument is the ISA check for the kernel set.

  // Parameters auto-tuned for a mix
  // NOTE(review): presumably a mix of the microarchitectures listed below - confirm.
  BENCHMARK_CAPTURE(
      TwoPassSoftMax, avx2_blend,
      xnn_f32_raddextexp_ukernel__avx2_p5_x96,
      xnn_f32_vscaleextexp_ukernel__avx2_p5_x40,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(
      ThreePassSoftMaxWithRecomputing, avx2_blend,
      xnn_f32_rmax_ukernel__avx,
      xnn_f32_raddexpminusmax_ukernel__avx2_p5_x96,
      xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x24,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(
      ThreePassSoftMaxWithReloading, avx2_blend,
      xnn_f32_rmax_ukernel__avx,
      xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64_acc2,
      xnn_f32_vscale_ukernel__avx_unroll32,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();

  // Parameters auto-tuned for Broadwell
  BENCHMARK_CAPTURE(
      TwoPassSoftMax, avx2_broadwell,
      xnn_f32_raddextexp_ukernel__avx2_p5_x96,
      xnn_f32_vscaleextexp_ukernel__avx2_p5_x32,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(
      ThreePassSoftMaxWithRecomputing, avx2_broadwell,
      xnn_f32_rmax_ukernel__avx,
      xnn_f32_raddexpminusmax_ukernel__avx2_p5_x96,
      xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x24,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(
      ThreePassSoftMaxWithReloading, avx2_broadwell,
      xnn_f32_rmax_ukernel__avx,
      xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64,
      xnn_f32_vscale_ukernel__avx_unroll32,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();

  // Parameters auto-tuned for Zen 2
  BENCHMARK_CAPTURE(
      TwoPassSoftMax, avx2_zen2,
      xnn_f32_raddextexp_ukernel__avx2_p5_x72,
      xnn_f32_vscaleextexp_ukernel__avx2_p5_x40,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(
      ThreePassSoftMaxWithRecomputing, avx2_zen2,
      xnn_f32_rmax_ukernel__avx,
      xnn_f32_raddexpminusmax_ukernel__avx2_p5_x80,
      xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x16,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(
      ThreePassSoftMaxWithReloading, avx2_zen2,
      xnn_f32_rmax_ukernel__avx,
      xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x64,
      xnn_f32_vscale_ukernel__avx_unroll32,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();

  // Parameters auto-tuned for Skylake
  BENCHMARK_CAPTURE(
      TwoPassSoftMax, avx2_skylake,
      xnn_f32_raddextexp_ukernel__avx2_p5_x64,
      xnn_f32_vscaleextexp_ukernel__avx2_p5_x40,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(
      ThreePassSoftMaxWithRecomputing, avx2_skylake,
      xnn_f32_rmax_ukernel__avx,
      xnn_f32_raddexpminusmax_ukernel__avx2_p5_x64_acc2,
      xnn_f32_vscaleexpminusmax_ukernel__avx2_p5_x24,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(
      ThreePassSoftMaxWithReloading, avx2_skylake,
      xnn_f32_rmax_ukernel__avx,
      xnn_f32_raddstoreexpminusmax_ukernel__avx2_p5_x80_acc2,
      xnn_f32_vscale_ukernel__avx_unroll32,
      benchmark::utils::CheckAVX2)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();

  // AVX512F kernel set, registered under the "avx512f_skylake" label and
  // gated on CheckAVX512F instead of CheckAVX2.
  BENCHMARK_CAPTURE(
      TwoPassSoftMax, avx512f_skylake,
      xnn_f32_raddextexp_ukernel__avx512f_p5_scalef_x144_acc3,
      xnn_f32_vscaleextexp_ukernel__avx512f_p5_scalef_x16,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(
      ThreePassSoftMaxWithRecomputing, avx512f_skylake,
      xnn_f32_rmax_ukernel__avx512f,
      xnn_f32_raddexpminusmax_ukernel__avx512f_p5_scalef_x128_acc4,
      xnn_f32_vscaleexpminusmax_ukernel__avx512f_p5_scalef_x16,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
  BENCHMARK_CAPTURE(
      ThreePassSoftMaxWithReloading, avx512f_skylake,
      xnn_f32_rmax_ukernel__avx512f,
      xnn_f32_raddstoreexpminusmax_ukernel__avx512f_p5_scalef_x128_acc2,
      xnn_f32_vscale_ukernel__avx512f_unroll64,
      benchmark::utils::CheckAVX512F)
    ->Apply(CharacteristicArguments)
    ->UseManualTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
| 272 | |
| 273 | #ifndef XNNPACK_BENCHMARK_NO_MAIN |
| 274 | BENCHMARK_MAIN(); |
| 275 | #endif |