blob: 7b1233db7688d938b93b897b817c551043a19a6f [file] [log] [blame]
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
6#include <algorithm>
7#include <cfloat>
8#include <cmath>
9#include <functional>
10#include <random>
11#include <vector>
12
13#include <cpuinfo.h>
14
15#include <benchmark/benchmark.h>
16#include "bench/dwconv.h"
17#include "bench/utils.h"
18#include <xnnpack/AlignedAllocator.h>
Marat Dukhan1dadbf72019-10-01 10:46:20 -070019#include <xnnpack/common.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070020#include <xnnpack/dwconv.h>
21#include <xnnpack/indirection.h>
22#include <xnnpack/operator.h>
23#include <xnnpack/pack.h>
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -070024#include <xnnpack/params-init.h>
25#include <xnnpack/params.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070026
27
Marat Dukhanbf715f92020-10-23 20:17:00 -070028static void DWConv2DBenchmark(benchmark::State& state,
29 xnn_f32_dwconv2d_chw_ukernel_function dwconv,
Marat Dukhanae7e8b22020-10-20 17:51:51 -070030 uint32_t kh, uint32_t kw, uint32_t pw, uint32_t s)
XNNPACK Teamb455b122019-09-27 18:10:33 -070031{
32 if (!cpuinfo_initialize()) {
33 state.SkipWithError("cpuinfo initialization failed");
34 return;
35 }
36
37 const size_t input_height = state.range(0);
38 const size_t input_width = state.range(1);
39 const size_t kernel_height = state.range(2);
40 const size_t kernel_width = state.range(3);
41 const size_t padding_height = state.range(4);
42 const size_t padding_width = state.range(5);
43 const size_t subsampling = state.range(6);
44 const size_t dilation = state.range(7);
45 const size_t channels = state.range(8);
46
47 if (kernel_height != kh) {
48 state.SkipWithError("kernel height mismatch");
49 return;
50 }
51
52 if (kernel_width != kw) {
53 state.SkipWithError("kernel width mismatch");
54 return;
55 }
56
57 if (subsampling != s) {
58 state.SkipWithError("subsampling mismatch");
59 return;
60 }
61
62 if (padding_width % 2 != 0 || padding_width / 2 != pw) {
63 state.SkipWithError("padding width mismatch");
64 return;
65 }
66
67 if (dilation != 1) {
68 state.SkipWithError("unsupported dilation");
69 return;
70 }
71
72 std::random_device random_device;
73 auto rng = std::mt19937(random_device());
Marat Dukhan44f0ca72020-08-02 21:46:58 -070074 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
XNNPACK Teamb455b122019-09-27 18:10:33 -070075
76 const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
77 const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
78 const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
79 const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
80
81 const size_t inputSize = (input_height + padding_height) * input_width;
82 const size_t kernel_size = kernel_height * kernel_width;
83 const size_t output_size = output_height * output_width;
84
Marat Dukhanae7e8b22020-10-20 17:51:51 -070085 std::vector<float> input(inputSize * channels + 2 * XNN_EXTRA_BYTES);
XNNPACK Teamb455b122019-09-27 18:10:33 -070086 std::generate(input.begin(), input.end(), std::ref(f32rng));
87 std::vector<float> bias(channels);
88 std::generate(bias.begin(), bias.end(), std::ref(f32rng));
89 std::vector<float> kernel(channels * kernel_size);
90 std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
Erich Elsen4e5db3d2020-05-07 08:57:47 -070091 std::vector<float> zero(input_width + padding_width);
XNNPACK Teamb455b122019-09-27 18:10:33 -070092
93 const size_t w_elements = (kernel_size + 1) * channels;
94 const size_t o_elements = output_size * channels;
95 const size_t num_buffers = 1 +
Marat Dukhan42323232019-10-23 02:09:02 -070096 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Teamb455b122019-09-27 18:10:33 -070097 sizeof(float) * (w_elements + o_elements));
98
99 std::vector<float, AlignedAllocator<float, 32>> packed_weights(w_elements * num_buffers);
100 std::fill(packed_weights.begin(), packed_weights.end(), 0.0f);
101 for (size_t c = 0; c < channels; c++) {
102 packed_weights[c * kernel_size + c] = bias[c];
103 for (size_t i = 0; i < kernel_size; i++) {
104 packed_weights[c * kernel_size + c + 1 + i] = kernel[c * kernel_size + i];
105 }
106 }
107 for (size_t n = 1; n < num_buffers; n++) {
108 std::copy(packed_weights.cbegin(), packed_weights.cbegin() + w_elements, packed_weights.begin() + n * w_elements);
109 }
110
111 std::vector<float> output(o_elements * num_buffers);
112 std::fill(output.begin(), output.end(), std::nanf(""));
113
Marat Dukhan1f29b802020-05-15 23:46:39 -0700114 xnn_f32_chw_params chw_params =
115 xnn_init_f32_chw_params(input_width, -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
XNNPACK Teamb455b122019-09-27 18:10:33 -0700116
117 size_t buffer_index = 0;
118 for (auto _ : state) {
119 state.PauseTiming();
Marat Dukhan42323232019-10-23 02:09:02 -0700120 benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700121 buffer_index = (buffer_index + 1) % num_buffers;
122 state.ResumeTiming();
123
124 for (uint32_t channel = 0; channel < channels; channel++) {
125 dwconv(
Marat Dukhan75157772020-10-21 01:46:28 -0700126 input_height, input_width * sizeof(float),
XNNPACK Teamb455b122019-09-27 18:10:33 -0700127 input.data() + channel * inputSize,
128 packed_weights.data() + channel * (kernel_size + 1) + buffer_index * w_elements,
Erich Elsen4e5db3d2020-05-07 08:57:47 -0700129 zero.data(),
XNNPACK Teamb455b122019-09-27 18:10:33 -0700130 output.data() + channel * output_size + buffer_index * o_elements,
Erich Elsen4e5db3d2020-05-07 08:57:47 -0700131 padding_height / 2, // padding_top
Marat Dukhan1f29b802020-05-15 23:46:39 -0700132 &chw_params);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700133 }
134 }
135
136 state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
137 state.counters["FLOPS"] = benchmark::Counter(
138 uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size,
139 benchmark::Counter::kIsRate);
140
141 state.counters["BYTES"] = benchmark::Counter(
142 uint64_t(state.iterations()) * (output_size + inputSize + kernel_size + 1 /* bias */) * channels * sizeof(float),
143 benchmark::Counter::kIsRate);
144}
145
#if XNN_ARCH_ARM64
// neonfma micro-kernels, compiled only when XNN_ARCH_ARM64 is set.
// Each wrapper forwards to DWConv2DBenchmark with the kernel height, kernel
// width, padding width and subsampling of its micro-kernel. The `net` argument
// is not used by the wrapper bodies.

static void dwconv2d_chw_3x3p1__neonfma_3x4(benchmark::State& state, const char* net) {
  DWConv2DBenchmark(
    state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__neonfma_3x4,
    3 /* kh */, 3 /* kw */, 1 /* pw */, 1 /* s */);
}

static void dwconv2d_chw_3x3s2p1__neonfma_1x4_acc3(benchmark::State& state, const char* net) {
  DWConv2DBenchmark(
    state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__neonfma_1x4_acc3,
    3 /* kh */, 3 /* kw */, 1 /* pw */, 2 /* s */);
}

static void dwconv2d_chw_5x5p2__neonfma_3x4(benchmark::State& state, const char* net) {
  DWConv2DBenchmark(
    state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__neonfma_3x4,
    5 /* kh */, 5 /* kw */, 2 /* pw */, 1 /* s */);
}

static void dwconv2d_chw_5x5s2p2__neonfma_1x4_acc2(benchmark::State& state, const char* net) {
  DWConv2DBenchmark(
    state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__neonfma_1x4_acc2,
    5 /* kh */, 5 /* kw */, 2 /* pw */, 2 /* s */);
}

// NOTE(review): BENCHMARK_DWCONV comes from bench/dwconv.h; presumably it
// registers each wrapper with the benchmark framework - confirm there.
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__neonfma_3x4)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__neonfma_1x4_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__neonfma_3x4)
BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__neonfma_1x4_acc2)
#endif  // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -0700168
169
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// sse micro-kernels, compiled only for the x86 / x86-64 architecture macros.
// Each wrapper forwards to DWConv2DBenchmark with the kernel height, kernel
// width, padding width and subsampling of its micro-kernel. The `net` argument
// is not used by the wrapper bodies.

static void dwconv2d_chw_3x3p1__sse_1x4_acc3(benchmark::State& state, const char* net) {
  DWConv2DBenchmark(
    state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__sse_1x4_acc3,
    3 /* kh */, 3 /* kw */, 1 /* pw */, 1 /* s */);
}

static void dwconv2d_chw_3x3s2p1__sse_1x4_acc3(benchmark::State& state, const char* net) {
  DWConv2DBenchmark(
    state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__sse_1x4_acc3,
    3 /* kh */, 3 /* kw */, 1 /* pw */, 2 /* s */);
}

// NOTE(review): BENCHMARK_DWCONV comes from bench/dwconv.h; presumably it
// registers each wrapper with the benchmark framework - confirm there.
BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__sse_1x4_acc3)
BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__sse_1x4_acc3)
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -0700182
Marat Dukhana199d492020-07-24 15:01:25 -0700183#if !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
Marat Dukhanbf715f92020-10-23 20:17:00 -0700184 static void dwconv2d_chw_3x3p1__psimd_1x4_acc3(benchmark::State& state, const char* net) {
185 DWConv2DBenchmark(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__psimd_1x4_acc3, 3, 3, 1, 1);
Erich Elsene6214af2020-06-10 22:17:22 -0700186 }
Marat Dukhanbf715f92020-10-23 20:17:00 -0700187 static void dwconv2d_chw_3x3s2p1__psimd_1x4_acc3(benchmark::State& state, const char* net) {
188 DWConv2DBenchmark(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__psimd_1x4_acc3, 3, 3, 1, 2);
Erich Elsenfd7a6e32020-06-11 12:04:44 -0700189 }
Marat Dukhanbf715f92020-10-23 20:17:00 -0700190 static void dwconv2d_chw_5x5p2__psimd_3x4(benchmark::State& state, const char* net) {
191 DWConv2DBenchmark(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__psimd_3x4, 5, 5, 2, 1);
Erich Elsen28928892020-06-12 08:08:19 -0700192 }
Marat Dukhanbf715f92020-10-23 20:17:00 -0700193 static void dwconv2d_chw_5x5s2p2__psimd_1x4_acc2(benchmark::State& state, const char* net) {
194 DWConv2DBenchmark(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__psimd_1x4_acc2, 5, 5, 2, 2);
Erich Elsen7465a892020-06-13 14:02:04 -0700195 }
196
Marat Dukhanbf715f92020-10-23 20:17:00 -0700197 BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__psimd_1x4_acc3)
198 BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__psimd_1x4_acc3)
199 BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__psimd_3x4)
200 BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__psimd_1x4_acc2)
Marat Dukhana199d492020-07-24 15:01:25 -0700201#endif // !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
Erich Elsene6214af2020-06-10 22:17:22 -0700202
Marat Dukhanbf715f92020-10-23 20:17:00 -0700203static void dwconv2d_chw_3x3p1__scalar_1x1_acc3(benchmark::State& state, const char* net) {
204 DWConv2DBenchmark(state, xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_1x1_acc3, 3, 3, 1, 1);
Marat Dukhanae7e8b22020-10-20 17:51:51 -0700205}
Erich Elsen0cc2c532019-10-15 04:44:18 -0700206
Marat Dukhanbf715f92020-10-23 20:17:00 -0700207static void dwconv2d_chw_3x3s2p1__scalar_1x1_acc3(benchmark::State& state, const char* net) {
208 DWConv2DBenchmark(state, xnn_f32_dwconv2d_chw_ukernel_3x3s2p1__scalar_1x1_acc3, 3, 3, 1, 2);
Marat Dukhanae7e8b22020-10-20 17:51:51 -0700209}
Erich Elsen38709a62019-11-08 11:58:45 -0800210
Marat Dukhanbf715f92020-10-23 20:17:00 -0700211static void dwconv2d_chw_5x5p2__scalar_1x1_acc5(benchmark::State& state, const char* net) {
212 DWConv2DBenchmark(state, xnn_f32_dwconv2d_chw_ukernel_5x5p2__scalar_1x1_acc5, 5, 5, 2, 1);
Marat Dukhanae7e8b22020-10-20 17:51:51 -0700213}
Erich Elsenac4de802019-10-16 04:35:30 -0700214
Marat Dukhanbf715f92020-10-23 20:17:00 -0700215static void dwconv2d_chw_5x5s2p2__scalar_1x1_acc5(benchmark::State& state, const char* net) {
216 DWConv2DBenchmark(state, xnn_f32_dwconv2d_chw_ukernel_5x5s2p2__scalar_1x1_acc5, 5, 5, 2, 2);
Marat Dukhanae7e8b22020-10-20 17:51:51 -0700217}
Erich Elsen38709a62019-11-08 11:58:45 -0800218
Marat Dukhanbf715f92020-10-23 20:17:00 -0700219BENCHMARK_DWCONV(dwconv2d_chw_3x3p1__scalar_1x1_acc3)
220BENCHMARK_DWCONV(dwconv2d_chw_3x3s2p1__scalar_1x1_acc3)
221BENCHMARK_DWCONV(dwconv2d_chw_5x5p2__scalar_1x1_acc5)
222BENCHMARK_DWCONV(dwconv2d_chw_5x5s2p2__scalar_1x1_acc5)
XNNPACK Teamb455b122019-09-27 18:10:33 -0700223
224#ifndef XNNPACK_BENCHMARK_NO_MAIN
225BENCHMARK_MAIN();
226#endif