// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#include "bench/dwconv.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/dwconv.h>
#include <xnnpack/indirection.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>


28static void DWConvCHWBenchmark(benchmark::State& state,
29 xnn_f32_dwconv_spchw_ukernel_function dwconv,
30 uint32_t it, uint32_t ot, uint32_t kh, uint32_t kw, uint32_t pw, uint32_t s)
31{
32 if (!cpuinfo_initialize()) {
33 state.SkipWithError("cpuinfo initialization failed");
34 return;
35 }
36
37 const size_t input_height = state.range(0);
38 const size_t input_width = state.range(1);
39 const size_t kernel_height = state.range(2);
40 const size_t kernel_width = state.range(3);
41 const size_t padding_height = state.range(4);
42 const size_t padding_width = state.range(5);
43 const size_t subsampling = state.range(6);
44 const size_t dilation = state.range(7);
45 const size_t channels = state.range(8);
46
47 if (kernel_height != kh) {
48 state.SkipWithError("kernel height mismatch");
49 return;
50 }
51
52 if (kernel_width != kw) {
53 state.SkipWithError("kernel width mismatch");
54 return;
55 }
56
57 if (subsampling != s) {
58 state.SkipWithError("subsampling mismatch");
59 return;
60 }
61
62 if (padding_width % 2 != 0 || padding_width / 2 != pw) {
63 state.SkipWithError("padding width mismatch");
64 return;
65 }
66
67 if (dilation != 1) {
68 state.SkipWithError("unsupported dilation");
69 return;
70 }
71
72 std::random_device random_device;
73 auto rng = std::mt19937(random_device());
74 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
75
76 const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
77 const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
78 const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
79 const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
80
81 const size_t inputSize = (input_height + padding_height) * input_width;
82 const size_t kernel_size = kernel_height * kernel_width;
83 const size_t output_size = output_height * output_width;
84
85 std::vector<float> input(inputSize * channels + 2 * it);
86 std::generate(input.begin(), input.end(), std::ref(f32rng));
87 std::vector<float> bias(channels);
88 std::generate(bias.begin(), bias.end(), std::ref(f32rng));
89 std::vector<float> kernel(channels * kernel_size);
90 std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
91
92 const size_t w_elements = (kernel_size + 1) * channels;
93 const size_t o_elements = output_size * channels;
94 const size_t num_buffers = 1 +
Marat Dukhan42323232019-10-23 02:09:02 -070095 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Teamb455b122019-09-27 18:10:33 -070096 sizeof(float) * (w_elements + o_elements));
97
98 std::vector<float, AlignedAllocator<float, 32>> packed_weights(w_elements * num_buffers);
99 std::fill(packed_weights.begin(), packed_weights.end(), 0.0f);
100 for (size_t c = 0; c < channels; c++) {
101 packed_weights[c * kernel_size + c] = bias[c];
102 for (size_t i = 0; i < kernel_size; i++) {
103 packed_weights[c * kernel_size + c + 1 + i] = kernel[c * kernel_size + i];
104 }
105 }
106 for (size_t n = 1; n < num_buffers; n++) {
107 std::copy(packed_weights.cbegin(), packed_weights.cbegin() + w_elements, packed_weights.begin() + n * w_elements);
108 }
109
110 std::vector<float> output(o_elements * num_buffers);
111 std::fill(output.begin(), output.end(), std::nanf(""));
112
113 xnn_f32_spchw_params output_params =
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700114 xnn_init_f32_spchw_params(input_width, -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
XNNPACK Teamb455b122019-09-27 18:10:33 -0700115
116 size_t buffer_index = 0;
117 for (auto _ : state) {
118 state.PauseTiming();
Marat Dukhan42323232019-10-23 02:09:02 -0700119 benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700120 buffer_index = (buffer_index + 1) % num_buffers;
121 state.ResumeTiming();
122
123 for (uint32_t channel = 0; channel < channels; channel++) {
124 dwconv(
125 output_height, input_width,
126 input.data() + channel * inputSize,
127 packed_weights.data() + channel * (kernel_size + 1) + buffer_index * w_elements,
128 output.data() + channel * output_size + buffer_index * o_elements,
129 it * sizeof(float), ot * sizeof(float),
130 input_width * sizeof(float), output_width * sizeof(float),
131 &output_params);
132 }
133 }
134
135 state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
136 state.counters["FLOPS"] = benchmark::Counter(
137 uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
138 benchmark::Counter::kIsRate);
139
140 state.counters["BYTES"] = benchmark::Counter(
141 uint64_t(state.iterations()) * (output_size + inputSize + kernel_size + 1 /* bias */) * channels * sizeof(float),
142 benchmark::Counter::kIsRate);
143}
144
145static void DWConvHWoTCTBenchmark(benchmark::State& state,
146 xnn_f32_dwconv_spchw_ukernel_function dwconv,
147 uint32_t it, uint32_t ot, uint32_t kh, uint32_t kw, uint32_t pw, uint32_t s)
148{
149 if (!cpuinfo_initialize()) {
150 state.SkipWithError("cpuinfo initialization failed");
151 return;
152 }
153
154 const size_t input_height = state.range(0);
155 const size_t input_width = state.range(1);
156 const size_t kernel_height = state.range(2);
157 const size_t kernel_width = state.range(3);
158 const size_t padding_height = state.range(4);
159 const size_t padding_width = state.range(5);
160 const size_t subsampling = state.range(6);
161 const size_t dilation = state.range(7);
162 const size_t channels = state.range(8);
163
164 if (kernel_height != kh) {
165 state.SkipWithError("kernel height mismatch");
166 return;
167 }
168
169 if (kernel_width != kw) {
170 state.SkipWithError("kernel width mismatch");
171 return;
172 }
173
174 if (subsampling != s) {
175 state.SkipWithError("subsampling mismatch");
176 return;
177 }
178
179 if (padding_width % 2 != 0 || padding_width / 2 != pw) {
180 state.SkipWithError("padding width mismatch");
181 return;
182 }
183
184 if (dilation != 1) {
185 state.SkipWithError("unsupported dilation");
186 return;
187 }
188
189 std::random_device random_device;
190 auto rng = std::mt19937(random_device());
191 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
192
193 const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
194 const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
195 const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
196 const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
197
198 const size_t inputSize = (input_height + padding_height) * input_width;
199 const size_t kernel_size = kernel_height * kernel_width;
200 const size_t output_size = output_height * output_width;
201
Marat Dukhan42323232019-10-23 02:09:02 -0700202 std::vector<float> input(input_height * benchmark::utils::RoundUp<size_t>(input_width, it) * channels);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700203 std::generate(input.begin(), input.end(), std::ref(f32rng));
204 std::vector<float> bias(channels);
205 std::generate(bias.begin(), bias.end(), std::ref(f32rng));
206 std::vector<float> kernel(channels * kernel_size);
207 std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
208
209 const size_t w_elements = (kernel_size + 1) * channels;
Marat Dukhan42323232019-10-23 02:09:02 -0700210 const size_t o_elements = output_height * benchmark::utils::RoundUp<size_t>(output_width, ot) * channels;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700211 const size_t num_buffers = 1 +
Marat Dukhan42323232019-10-23 02:09:02 -0700212 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Teamb455b122019-09-27 18:10:33 -0700213 sizeof(float) * (w_elements + o_elements));
214
215 std::vector<float, AlignedAllocator<float, 32>> packed_weights(w_elements * num_buffers);
216 std::fill(packed_weights.begin(), packed_weights.end(), 0.0f);
217 for (size_t c = 0; c < channels; c++) {
218 packed_weights[c * kernel_size + c] = bias[c];
219 for (size_t i = 0; i < kernel_size; i++) {
220 packed_weights[c * kernel_size + c + 1 + i] = kernel[c * kernel_size + i];
221 }
222 }
223 for (size_t n = 1; n < num_buffers; n++) {
224 std::copy(packed_weights.cbegin(), packed_weights.cbegin() + w_elements, packed_weights.begin() + n * w_elements);
225 }
226
227 std::vector<float> output(o_elements * num_buffers);
228 std::fill(output.begin(), output.end(), std::nanf(""));
229
230 xnn_f32_spchw_params output_params =
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700231 xnn_init_f32_spchw_params(input_width, -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
XNNPACK Teamb455b122019-09-27 18:10:33 -0700232
233 size_t buffer_index = 0;
234 for (auto _ : state) {
235 state.PauseTiming();
Marat Dukhan42323232019-10-23 02:09:02 -0700236 benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700237 buffer_index = (buffer_index + 1) % num_buffers;
238 state.ResumeTiming();
239
240 for (uint32_t channel = 0; channel < channels; channel++) {
241 dwconv(
242 output_height, input_width,
243 input.data() + channel * it,
244 packed_weights.data() + channel * (kernel_size + 1) + buffer_index * w_elements,
245 output.data() + channel * ot + buffer_index * o_elements,
246 it * channels * sizeof(float), ot * channels * sizeof(float),
Marat Dukhan42323232019-10-23 02:09:02 -0700247 benchmark::utils::RoundUp<size_t>(input_width, it) * channels * sizeof(float),
248 benchmark::utils::RoundUp<size_t>(output_width, ot) * channels * sizeof(float),
XNNPACK Teamb455b122019-09-27 18:10:33 -0700249 &output_params);
250 }
251 }
252
253 state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
254 state.counters["FLOPS"] = benchmark::Counter(
255 uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
256 benchmark::Counter::kIsRate);
257
258 state.counters["BYTES"] = benchmark::Counter(
259 uint64_t(state.iterations()) * (output_size + inputSize + kernel_size + 1 /* bias */) * channels * sizeof(float),
260 benchmark::Counter::kIsRate);
261}
262
#if XNN_ARCH_ARM64
  // Benchmarks for the *__neonfma micro-kernels, built only for ARM64.
  // The numeric arguments after the micro-kernel are, in order:
  // it, ot, kh, kw, pw, s (see DWConvCHWBenchmark / DWConvHWoTCTBenchmark).
  static void CHW_3x3p1__neonfma(benchmark::State& state, const char* net) {
    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma, 4, 4, 3, 3, 1, 1);
  }

  static void CHW_5x5p2__neonfma(benchmark::State& state, const char* net) {
    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma, 4, 4, 5, 5, 2, 1);
  }

  static void CHW_3x3s2p1__neonfma(benchmark::State& state, const char* net) {
    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma, 4, 4, 3, 3, 1, 2);
  }

  static void CHW_5x5s2p2__neonfma(benchmark::State& state, const char* net) {
    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma, 4, 4, 5, 5, 2, 2);
  }

  static void HWo4C4_3x3p1__neonfma(benchmark::State& state, const char* net) {
    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3p1__neonfma, 4, 4, 3, 3, 1, 1);
  }

  static void HWo4C4_5x5p2__neonfma(benchmark::State& state, const char* net) {
    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5p2__neonfma, 4, 4, 5, 5, 2, 1);
  }

  static void HWo4C4_3x3s2p1__neonfma(benchmark::State& state, const char* net) {
    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3s2p1__neonfma, 4, 4, 3, 3, 1, 2);
  }

  static void HWo4C4_5x5s2p2__neonfma(benchmark::State& state, const char* net) {
    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5s2p2__neonfma, 4, 4, 5, 5, 2, 2);
  }

  BENCHMARK_DWCONV(CHW_3x3p1__neonfma)
  BENCHMARK_DWCONV(CHW_5x5p2__neonfma)
  BENCHMARK_DWCONV(CHW_3x3s2p1__neonfma)
  BENCHMARK_DWCONV(CHW_5x5s2p2__neonfma)
  BENCHMARK_DWCONV(HWo4C4_3x3p1__neonfma)
  BENCHMARK_DWCONV(HWo4C4_5x5p2__neonfma)
  BENCHMARK_DWCONV(HWo4C4_3x3s2p1__neonfma)
  BENCHMARK_DWCONV(HWo4C4_5x5s2p2__neonfma)
#endif  // XNN_ARCH_ARM64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  // Benchmarks for the *__sse micro-kernels, built only for x86 / x86-64.
  // The numeric arguments after the micro-kernel are, in order:
  // it, ot, kh, kw, pw, s (see DWConvCHWBenchmark / DWConvHWoTCTBenchmark).
  static void CHW_3x3p1__sse(benchmark::State& state, const char* net) {
    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3p1__sse, 4, 4, 3, 3, 1, 1);
  }

  static void CHW_3x3s2p1__sse(benchmark::State& state, const char* net) {
    DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse, 4, 4, 3, 3, 1, 2);
  }

  static void HWo4C4_3x3p1__sse(benchmark::State& state, const char* net) {
    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3p1__sse, 4, 4, 3, 3, 1, 1);
  }

  static void HWo4C4_3x3s2p1__sse(benchmark::State& state, const char* net) {
    DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3s2p1__sse, 4, 4, 3, 3, 1, 2);
  }

  BENCHMARK_DWCONV(CHW_3x3p1__sse)
  BENCHMARK_DWCONV(CHW_3x3s2p1__sse)
  BENCHMARK_DWCONV(HWo4C4_3x3p1__sse)
  BENCHMARK_DWCONV(HWo4C4_3x3s2p1__sse)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

Erich Elsen0cc2c532019-10-15 04:44:18 -0700330 static void CHW_3x3p1__scalar(benchmark::State& state, const char* net) {
331 DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar, 1, 1, 3, 3, 1, 1);
332 }
333
Erich Elsen38709a62019-11-08 11:58:45 -0800334 static void CHW_5x5p2__scalar(benchmark::State& state, const char* net) {
335 DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar, 1, 1, 5, 5, 2, 1);
336 }
337
Erich Elsenac4de802019-10-16 04:35:30 -0700338 static void CHW_3x3s2p1__scalar(benchmark::State& state, const char* net) {
339 DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, 1, 1, 3, 3, 1, 2);
340 }
341
Erich Elsen38709a62019-11-08 11:58:45 -0800342 static void CHW_5x5s2p2__scalar(benchmark::State& state, const char* net) {
343 DWConvCHWBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, 1, 1, 5, 5, 2, 2);
344 }
345
Erich Elsenac4de802019-10-16 04:35:30 -0700346 static void HWC_3x3p1__scalar(benchmark::State& state, const char* net) {
Erich Elsen0cc2c532019-10-15 04:44:18 -0700347 DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3p1__scalar, 1, 1, 3, 3, 1, 1);
348 }
349
Erich Elsen38709a62019-11-08 11:58:45 -0800350 static void HWC_5x5p2__scalar(benchmark::State& state, const char* net) {
351 DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5p2__scalar, 1, 1, 5, 5, 2, 1);
352 }
353
Erich Elsenac4de802019-10-16 04:35:30 -0700354 static void HWC_3x3s2p1__scalar(benchmark::State& state, const char* net) {
355 DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_3x3s2p1__scalar, 1, 1, 3, 3, 1, 2);
356 }
357
Erich Elsen38709a62019-11-08 11:58:45 -0800358 static void HWC_5x5s2p2__scalar(benchmark::State& state, const char* net) {
359 DWConvHWoTCTBenchmark(state, xnn_f32_dwconv_spchw_ukernel_5x5s2p2__scalar, 1, 1, 5, 5, 2, 2);
360 }
361
Erich Elsen0cc2c532019-10-15 04:44:18 -0700362
363 BENCHMARK_DWCONV(CHW_3x3p1__scalar)
Erich Elsen38709a62019-11-08 11:58:45 -0800364 BENCHMARK_DWCONV(CHW_5x5p2__scalar)
Erich Elsenac4de802019-10-16 04:35:30 -0700365 BENCHMARK_DWCONV(CHW_3x3s2p1__scalar)
Erich Elsen38709a62019-11-08 11:58:45 -0800366 BENCHMARK_DWCONV(CHW_5x5s2p2__scalar)
Erich Elsenac4de802019-10-16 04:35:30 -0700367 BENCHMARK_DWCONV(HWC_3x3p1__scalar)
Erich Elsen38709a62019-11-08 11:58:45 -0800368 BENCHMARK_DWCONV(HWC_5x5p2__scalar)
Erich Elsenac4de802019-10-16 04:35:30 -0700369 BENCHMARK_DWCONV(HWC_3x3s2p1__scalar)
Erich Elsen38709a62019-11-08 11:58:45 -0800370 BENCHMARK_DWCONV(HWC_5x5s2p2__scalar)
XNNPACK Teamb455b122019-09-27 18:10:33 -0700371
372#ifndef XNNPACK_BENCHMARK_NO_MAIN
373BENCHMARK_MAIN();
374#endif