// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstdint>
#include <functional>
#include <random>
#include <vector>
12
13#include <cpuinfo.h>
14
15#include <benchmark/benchmark.h>
16#include <fp16/fp16.h>
17#include "bench/dwconv.h"
18#include "bench/utils.h"
19#include <xnnpack/AlignedAllocator.h>
20#include <xnnpack/common.h>
21#include <xnnpack/dwconv.h>
22#include <xnnpack/indirection.h>
23#include <xnnpack/operator.h>
24#include <xnnpack/pack.h>
25#include <xnnpack/params-init.h>
26#include <xnnpack/params.h>
27
28
29static void DWConvBenchmark(benchmark::State& state,
30 xnn_f16_dwconv_minmax_unipass_ukernel_function dwconv,
31 uint32_t cr, uint32_t kr,
32 benchmark::utils::IsaCheckFunction isa_check = nullptr)
33{
34 if (!cpuinfo_initialize()) {
35 state.SkipWithError("cpuinfo initialization failed");
36 return;
37 }
38 if (isa_check && !isa_check(state)) {
39 return;
40 }
41
42 const size_t input_height = state.range(0);
43 const size_t input_width = state.range(1);
44 const size_t kernel_height = state.range(2);
45 const size_t kernel_width = state.range(3);
46 const size_t padding_height = state.range(4);
47 const size_t padding_width = state.range(5);
48 const size_t subsampling = state.range(6);
49 const size_t dilation = state.range(7);
50 const size_t channels = state.range(8);
51
52 const size_t kernel_size = kernel_height * kernel_width;
53 if (kernel_size != kr) {
54 state.SkipWithError("kernel size mismatch");
55 return;
56 }
57
58 std::random_device random_device;
59 auto rng = std::mt19937(random_device());
Marat Dukhan44f0ca72020-08-02 21:46:58 -070060 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
Frank Barchard5a599a62020-06-04 20:12:44 -070061 auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
62
63 const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
64 const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
65 const size_t padding_left = padding_width / 2;
66 const size_t padding_top = padding_height / 2;
67 const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
68 const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
69 const size_t output_size = output_height * output_width;
70 const size_t step_width = dilation == 1 ? subsampling : kernel_width;
71 const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
72
73 const size_t c_stride = benchmark::utils::RoundUp<size_t>(channels, cr);
74
75 std::vector<uint16_t> a(channels * input_height * input_width + XNN_EXTRA_BYTES / sizeof(uint16_t));
76 std::generate(a.begin(), a.end(), std::ref(f16rng));
77 std::vector<uint16_t> k(channels * kernel_height * kernel_width);
78 std::generate(k.begin(), k.end(), std::ref(f16rng));
79 std::vector<uint16_t> b(channels);
80 std::generate(b.begin(), b.end(), std::ref(f16rng));
81
82 std::vector<uint16_t> z(channels + XNN_EXTRA_BYTES / sizeof(uint16_t));
83
84 const size_t w_elements = (kernel_size + 1) * c_stride;
85 const size_t i_elements = output_height * step_height;
86 const size_t c_elements = output_size * channels;
87 const size_t num_buffers = 1 +
88 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
89 sizeof(uint16_t) * (w_elements + c_elements) + sizeof(void*) * i_elements);
90
91 std::vector<uint16_t, AlignedAllocator<uint16_t, 32>> w(w_elements * num_buffers);
92 std::fill(w.begin(), w.end(), 0.0f);
93 xnn_pack_f16_dwconv_ghw_w(kernel_height, kernel_width, channels, cr,
Marat Dukhan82286892021-06-04 17:27:27 -070094 k.data(), b.data(), w.data(), 0 /* extra bytes */, nullptr);
Frank Barchard5a599a62020-06-04 20:12:44 -070095 for (size_t n = 1; n < num_buffers; n++) {
96 std::copy(w.cbegin(), w.cbegin() + w_elements, w.begin() + n * w_elements);
97 }
98
99 std::vector<const uint16_t*> i(i_elements * num_buffers);
100 xnn_operator convolution_op = { };
101 convolution_op.indirection_buffer = reinterpret_cast<const void**>(i.data());
102 convolution_op.input = a.data();
103 convolution_op.input_pixel_stride = channels;
104 convolution_op.zero_buffer = z.data();
Frank Barchard5a599a62020-06-04 20:12:44 -0700105 convolution_op.input_height = input_height;
106 convolution_op.input_width = input_width;
107 convolution_op.output_height = output_height;
108 convolution_op.output_width = output_width;
109 convolution_op.kernel_height = kernel_height;
110 convolution_op.kernel_width = kernel_width;
111 convolution_op.stride_height = subsampling;
112 convolution_op.stride_width = subsampling;
113 convolution_op.dilation_height = dilation;
114 convolution_op.dilation_width = dilation;
115 convolution_op.padding_top = padding_top;
116 convolution_op.padding_left = padding_left;
117
Marat Dukhanc79427c2020-10-15 09:04:21 -0700118 xnn_indirection_init_dwconv2d(&convolution_op, step_height, step_width, 1 /* log2(sizeof(uint16_t)) */);
Frank Barchard5a599a62020-06-04 20:12:44 -0700119 for (size_t n = 1; n < num_buffers; n++) {
120 std::copy(i.cbegin(), i.cbegin() + i_elements, i.begin() + n * i_elements);
121 }
122
123 std::vector<uint16_t> c(c_elements * num_buffers);
124 std::fill(c.begin(), c.end(), std::nanf(""));
125
Marat Dukhanf56f4c42021-05-17 01:47:20 -0700126 xnn_f16_minmax_params params;
127 xnn_init_f16_minmax_params(&params, UINT16_C(0xFC00) /* -inf */, UINT16_C(0x7C00) /* inf */);
Frank Barchard5a599a62020-06-04 20:12:44 -0700128
129 size_t buffer_index = 0;
130 for (auto _ : state) {
131 state.PauseTiming();
132 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
133 buffer_index = (buffer_index + 1) % num_buffers;
134 state.ResumeTiming();
135
Marat Dukhanc79427c2020-10-15 09:04:21 -0700136 for (size_t y = 0; y < output_height; y++) {
Frank Barchard5a599a62020-06-04 20:12:44 -0700137 dwconv(channels, output_width,
138 reinterpret_cast<const void**>(i.data() + buffer_index * i_elements + step_height * y),
139 w.data() + buffer_index * w_elements,
140 c.data() + buffer_index * c_elements + y * output_width * channels,
141 kernel_height * step_width * sizeof(void*), 0,
142 0, z.data(), &params);
143 }
144 }
145
Marat Dukhand713e8a2020-12-04 14:23:12 -0800146 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
147 if (cpu_frequency != 0) {
148 state.counters["cpufreq"] = cpu_frequency;
149 }
Frank Barchard5a599a62020-06-04 20:12:44 -0700150
Marat Dukhand713e8a2020-12-04 14:23:12 -0800151 state.counters["FLOPS"] = benchmark::Counter(
152 uint64_t(state.iterations()) * 2 * output_size * channels * kernel_size, benchmark::Counter::kIsRate);
153
154 state.counters["bytes"] = benchmark::Counter(
Frank Barchard5a599a62020-06-04 20:12:44 -0700155 uint64_t(state.iterations()) * (output_size + input_height * input_width + kernel_size + 1 /* bias */) * channels * sizeof(uint16_t),
156 benchmark::Counter::kIsRate);
157}
158
#if XNN_ARCH_ARM64
  // Benchmark wrappers: each binds one f16 depthwise-convolution micro-kernel
  // variant to DWConvBenchmark. Naming: up<cr>x<kr> = channel tile x kernel
  // tile; "_acc2" variants split the accumulation across two accumulators.
  // All require NEON FP16 arithmetic, checked via CheckNEONFP16ARITH.
  static void f16_dwconv_8x25__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith_acc2, 8, 25,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_8x25__neonfp16arith(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x25__neonfp16arith, 8, 25,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_8x4__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith_acc2, 8, 4,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_8x4__neonfp16arith(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x4__neonfp16arith, 8, 4,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_8x9__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith_acc2, 8, 9,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_8x9__neonfp16arith(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up8x9__neonfp16arith, 8, 9,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_16x25__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith_acc2, 16, 25,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_16x25__neonfp16arith(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x25__neonfp16arith, 16, 25,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_16x4__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith_acc2, 16, 4,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_16x4__neonfp16arith(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x4__neonfp16arith, 16, 4,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_16x9__neonfp16arith_acc2(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith_acc2, 16, 9,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  static void f16_dwconv_16x9__neonfp16arith(benchmark::State& state, const char* net) {
    DWConvBenchmark(state, xnn_f16_dwconv_minmax_ukernel_up16x9__neonfp16arith, 16, 9,
      benchmark::utils::CheckNEONFP16ARITH);
  }

  // Register each wrapper against the standard model shapes (MobileNet etc.)
  // provided by the BENCHMARK_DWCONV macro from bench/dwconv.h.
  BENCHMARK_DWCONV(f16_dwconv_8x25__neonfp16arith_acc2)
  BENCHMARK_DWCONV(f16_dwconv_8x25__neonfp16arith)
  BENCHMARK_DWCONV(f16_dwconv_8x4__neonfp16arith_acc2)
  BENCHMARK_DWCONV(f16_dwconv_8x4__neonfp16arith)
  BENCHMARK_DWCONV(f16_dwconv_8x9__neonfp16arith_acc2)
  BENCHMARK_DWCONV(f16_dwconv_8x9__neonfp16arith)
  BENCHMARK_DWCONV(f16_dwconv_16x25__neonfp16arith_acc2)
  BENCHMARK_DWCONV(f16_dwconv_16x25__neonfp16arith)
  BENCHMARK_DWCONV(f16_dwconv_16x4__neonfp16arith_acc2)
  BENCHMARK_DWCONV(f16_dwconv_16x4__neonfp16arith)
  BENCHMARK_DWCONV(f16_dwconv_16x9__neonfp16arith_acc2)
  BENCHMARK_DWCONV(f16_dwconv_16x9__neonfp16arith)
#endif  // XNN_ARCH_ARM64
233
#ifndef XNNPACK_BENCHMARK_NO_MAIN
// Define the standard google-benchmark entry point unless this file is linked
// into a harness that supplies its own main().
BENCHMARK_MAIN();
#endif