blob: 5381a0d4e1e10739ecfa59100ad31c746b2fa273 [file] [log] [blame]
XNNPACK Teamb455b122019-09-27 18:10:33 -07001// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
#include <algorithm>
#include <cassert>
#include <cfloat>
#include <cmath>
#include <functional>
#include <limits>
#include <ostream>
#include <random>
#include <sstream>
#include <string>
#include <vector>
18
XNNPACK Teamb455b122019-09-27 18:10:33 -070019#include <xnnpack.h>
20
Frank Barchardbb4c18b2019-09-30 11:05:52 -070021#ifdef BENCHMARK_ARM_COMPUTE_LIBRARY
22#include "arm_compute/core/Types.h"
23#include "arm_compute/runtime/Tensor.h"
24#include "arm_compute/runtime/CPP/CPPScheduler.h"
25#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
26#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
27#endif // BENCHMARK_ARM_COMPUTE_LIBRARY
XNNPACK Teamb455b122019-09-27 18:10:33 -070028#include <benchmark/benchmark.h>
Frank Barchard49b4dcc2020-06-26 14:07:19 -070029#include <fp16.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070030#ifdef BENCHMARK_TENSORFLOW_LITE
31#include "flatbuffers/include/flatbuffers/flatbuffers.h"
32#include "tensorflow/lite/interpreter.h"
33#include "tensorflow/lite/kernels/register.h"
34#include "tensorflow/lite/model.h"
XNNPACK Teamb455b122019-09-27 18:10:33 -070035#include "tensorflow/lite/schema/schema_generated.h"
36#include "tensorflow/lite/version.h"
37#endif // BENCHMARK_TENSORFLOW_LITE
Frank Barchardbb4c18b2019-09-30 11:05:52 -070038#include "bench/utils.h"
XNNPACK Teamb455b122019-09-27 18:10:33 -070039
40
41void xnnpack_convolution_q8(benchmark::State& state, const char* net) {
42 const size_t batch_size = state.range(0);
43 const size_t input_height = state.range(1);
44 const size_t input_width = state.range(2);
45 const size_t kernel_height = state.range(3);
46 const size_t kernel_width = state.range(4);
47 const size_t padding_height = state.range(5);
48 const size_t padding_width = state.range(6);
49 const size_t subsampling = state.range(7);
50 const size_t dilation = state.range(8);
51 const size_t groups = state.range(9);
52 const size_t group_input_channels = state.range(10);
53 const size_t group_output_channels = state.range(11);
54
55 std::random_device random_device;
56 auto rng = std::mt19937(random_device());
57 auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
Marat Dukhan5ce30d92020-04-14 03:31:26 -070058 auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
XNNPACK Teamb455b122019-09-27 18:10:33 -070059
60 const size_t output_pixel_stride = groups * group_output_channels;
61 const size_t input_pixel_stride = groups * group_input_channels;
62 const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
63 const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
64 const size_t padding_left = padding_width / 2;
65 const size_t padding_top = padding_height / 2;
66 const size_t padding_right = padding_width - padding_left;
67 const size_t padding_bottom = padding_height - padding_top;
68 const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
69 const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
70
71 std::vector<uint8_t> input(batch_size * input_height * input_width * input_pixel_stride);
72 std::generate(input.begin(), input.end(), std::ref(u8rng));
73 std::vector<uint8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
74 std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
75 std::vector<int32_t> bias(groups * group_output_channels);
76 std::generate(bias.begin(), bias.end(), std::ref(s32rng));
77 const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
78
Marat Dukhan04f03be2019-11-19 12:36:47 -080079 xnn_status status = xnn_initialize(nullptr /* allocator */);
XNNPACK Teamb455b122019-09-27 18:10:33 -070080 if (status != xnn_status_success) {
81 state.SkipWithError("failed to initialize XNNPACK");
82 return;
83 }
84
XNNPACK Teamb455b122019-09-27 18:10:33 -070085 const size_t num_buffers = 1 +
Marat Dukhan42323232019-10-23 02:09:02 -070086 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Teamb455b122019-09-27 18:10:33 -070087 sizeof(uint8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(uint8_t) * output_elements);
88 std::vector<uint8_t> output(output_elements * num_buffers);
89
90 std::vector<xnn_operator_t> convolution_operators(num_buffers);
91 for (xnn_operator_t& convolution_op : convolution_operators) {
92 status = xnn_create_convolution2d_nhwc_q8(
93 padding_top, padding_right, padding_bottom, padding_left,
94 kernel_height, kernel_width,
95 subsampling, subsampling,
96 dilation, dilation,
97 groups, group_input_channels, group_output_channels,
98 input_pixel_stride, output_pixel_stride,
99 127, 0.5f,
100 127, 0.5f,
101 kernel.data(), bias.data(),
102 127, 0.5f, 0, 255,
103 0 /* flags */, &convolution_op);
104 if (status != xnn_status_success) {
105 state.SkipWithError("failed to create QINT8 Convolution operator");
106 return;
107 }
108 }
109
110 for (size_t i = 0; i < convolution_operators.size(); i++) {
111 status = xnn_setup_convolution2d_nhwc_q8(
112 convolution_operators[i],
113 batch_size, input_height, input_width,
114 input.data(), output.data() + i * output_elements,
115 nullptr /* thread pool */);
116 if (status != xnn_status_success) {
117 state.SkipWithError("failed to setup QINT8 Convolution operator");
118 return;
119 }
120 }
121
122 size_t buffer_index = 0;
123 for (auto _ : state) {
124 state.PauseTiming();
Marat Dukhan42323232019-10-23 02:09:02 -0700125 benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700126 buffer_index = (buffer_index + 1) % num_buffers;
127 state.ResumeTiming();
128
129 status = xnn_run_operator(convolution_operators[buffer_index],
130 nullptr /* thread pool */);
131 if (status != xnn_status_success) {
132 state.SkipWithError("failed to run QINT8 Convolution operator");
133 return;
134 }
135 }
136
137 for (xnn_operator_t& convolution_op : convolution_operators) {
138 status = xnn_delete_operator(convolution_op);
139 if (status != xnn_status_success) {
140 state.SkipWithError("failed to delete QINT8 Convolution operator");
141 return;
142 }
143 convolution_op = nullptr;
144 }
145
Frank Barchardbb4c18b2019-09-30 11:05:52 -0700146 state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
XNNPACK Teamb455b122019-09-27 18:10:33 -0700147 state.counters["OPS"] = benchmark::Counter(
148 uint64_t(state.iterations()) * 2 *
149 batch_size * output_height * output_width *
150 groups * group_input_channels * group_output_channels *
151 kernel_height * kernel_width,
152 benchmark::Counter::kIsRate);
153}
154
Frank Barchard49b4dcc2020-06-26 14:07:19 -0700155void xnnpack_convolution_f16(benchmark::State& state, const char* net) {
156 if (!benchmark::utils::CheckNEONFP16ARITH(state)) {
157 return;
158 }
159 const size_t batch_size = state.range(0);
160 const size_t input_height = state.range(1);
161 const size_t input_width = state.range(2);
162 const size_t kernel_height = state.range(3);
163 const size_t kernel_width = state.range(4);
164 const size_t padding_height = state.range(5);
165 const size_t padding_width = state.range(6);
166 const size_t subsampling = state.range(7);
167 const size_t dilation = state.range(8);
168 const size_t groups = state.range(9);
169 const size_t group_input_channels = state.range(10);
170 const size_t group_output_channels = state.range(11);
171
172 std::random_device random_device;
173 auto rng = std::mt19937(random_device());
174 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);
175 auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
176
177 const size_t output_pixel_stride = groups * group_output_channels;
178 const size_t input_pixel_stride = groups * group_input_channels;
179 const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
180 const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
181 const size_t padding_left = padding_width / 2;
182 const size_t padding_top = padding_height / 2;
183 const size_t padding_right = padding_width - padding_left;
184 const size_t padding_bottom = padding_height - padding_top;
185 const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
186 const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
187
188 std::vector<uint16_t> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
189 std::generate(input.begin(), input.end(), std::ref(f16rng));
190 std::vector<uint16_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
191 std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
192 std::vector<uint16_t> bias(groups * group_output_channels);
193 std::generate(bias.begin(), bias.end(), std::ref(f16rng));
194 const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
195
196 xnn_status status = xnn_initialize(nullptr /* allocator */);
197 if (status != xnn_status_success) {
198 state.SkipWithError("failed to initialize XNNPACK");
199 return;
200 }
201
202 const size_t num_buffers = 1 +
203 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
204 sizeof(uint16_t) * (kernel.size() + bias.size() + output_elements));
205 std::vector<uint16_t> output(output_elements * num_buffers);
206
207 std::vector<xnn_operator_t> convolution_operators(num_buffers);
208 for (xnn_operator_t& convolution_op : convolution_operators) {
209 status = xnn_create_convolution2d_nhwc_f16(
210 padding_top, padding_right, padding_bottom, padding_left,
211 kernel_height, kernel_width,
212 subsampling, subsampling,
213 dilation, dilation,
214 groups, group_input_channels, group_output_channels,
215 input_pixel_stride, output_pixel_stride,
216 kernel.data(), bias.data(),
217 -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
218 0 /* flags */, &convolution_op);
219 if (status != xnn_status_success) {
220 state.SkipWithError("failed to create FP16 Convolution operator");
221 return;
222 }
223 }
224
225 for (size_t i = 0; i < convolution_operators.size(); i++) {
226 status = xnn_setup_convolution2d_nhwc_f16(
227 convolution_operators[i],
228 batch_size, input_height, input_width,
229 input.data(), output.data() + i * output_elements,
230 nullptr /* thread pool */);
231 if (status != xnn_status_success) {
232 state.SkipWithError("failed to setup FP16 Convolution operator");
233 return;
234 }
235 }
236
237 size_t buffer_index = 0;
238 for (auto _ : state) {
239 state.PauseTiming();
240 benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint16_t));
241 buffer_index = (buffer_index + 1) % num_buffers;
242 state.ResumeTiming();
243
244 status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
245 if (status != xnn_status_success) {
246 state.SkipWithError("failed to run FP16 Convolution operator");
247 return;
248 }
249 }
250
251 for (xnn_operator_t& convolution_op : convolution_operators) {
252 status = xnn_delete_operator(convolution_op);
253 if (status != xnn_status_success) {
254 state.SkipWithError("failed to delete FP16 Convolution operator");
255 return;
256 }
257 convolution_op = nullptr;
258 }
259
260 state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
261 state.counters["FLOPS"] = benchmark::Counter(
262 uint64_t(state.iterations()) * 2 *
263 batch_size * output_height * output_width *
264 groups * group_input_channels * group_output_channels *
265 kernel_height * kernel_width,
266 benchmark::Counter::kIsRate);
267}
268
XNNPACK Teamb455b122019-09-27 18:10:33 -0700269void xnnpack_convolution_f32(benchmark::State& state, const char* net) {
270 const size_t batch_size = state.range(0);
271 const size_t input_height = state.range(1);
272 const size_t input_width = state.range(2);
273 const size_t kernel_height = state.range(3);
274 const size_t kernel_width = state.range(4);
275 const size_t padding_height = state.range(5);
276 const size_t padding_width = state.range(6);
277 const size_t subsampling = state.range(7);
278 const size_t dilation = state.range(8);
279 const size_t groups = state.range(9);
280 const size_t group_input_channels = state.range(10);
281 const size_t group_output_channels = state.range(11);
282
283 std::random_device random_device;
284 auto rng = std::mt19937(random_device());
285 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
286
287 const size_t output_pixel_stride = groups * group_output_channels;
288 const size_t input_pixel_stride = groups * group_input_channels;
289 const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
290 const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
291 const size_t padding_left = padding_width / 2;
292 const size_t padding_top = padding_height / 2;
293 const size_t padding_right = padding_width - padding_left;
294 const size_t padding_bottom = padding_height - padding_top;
295 const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
296 const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
297
298 std::vector<float> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float));
299 std::generate(input.begin(), input.end(), std::ref(f32rng));
300 std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
301 std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
302 std::vector<float> bias(groups * group_output_channels);
303 std::generate(bias.begin(), bias.end(), std::ref(f32rng));
304 const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;
305
Marat Dukhan04f03be2019-11-19 12:36:47 -0800306 xnn_status status = xnn_initialize(nullptr /* allocator */);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700307 if (status != xnn_status_success) {
308 state.SkipWithError("failed to initialize XNNPACK");
309 return;
310 }
311
XNNPACK Teamb455b122019-09-27 18:10:33 -0700312 const size_t num_buffers = 1 +
Marat Dukhan42323232019-10-23 02:09:02 -0700313 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Teamb455b122019-09-27 18:10:33 -0700314 sizeof(float) * (kernel.size() + bias.size() + output_elements));
315 std::vector<float> output(output_elements * num_buffers);
316
317 std::vector<xnn_operator_t> convolution_operators(num_buffers);
318 for (xnn_operator_t& convolution_op : convolution_operators) {
319 status = xnn_create_convolution2d_nhwc_f32(
320 padding_top, padding_right, padding_bottom, padding_left,
321 kernel_height, kernel_width,
322 subsampling, subsampling,
323 dilation, dilation,
324 groups, group_input_channels, group_output_channels,
325 input_pixel_stride, output_pixel_stride,
326 kernel.data(), bias.data(),
327 -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
328 0 /* flags */, &convolution_op);
329 if (status != xnn_status_success) {
330 state.SkipWithError("failed to create FP32 Convolution operator");
331 return;
332 }
333 }
334
335 for (size_t i = 0; i < convolution_operators.size(); i++) {
336 status = xnn_setup_convolution2d_nhwc_f32(
337 convolution_operators[i],
338 batch_size, input_height, input_width,
339 input.data(), output.data() + i * output_elements,
340 nullptr /* thread pool */);
341 if (status != xnn_status_success) {
342 state.SkipWithError("failed to setup FP32 Convolution operator");
343 return;
344 }
345 }
346
347 size_t buffer_index = 0;
348 for (auto _ : state) {
349 state.PauseTiming();
Marat Dukhan42323232019-10-23 02:09:02 -0700350 benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700351 buffer_index = (buffer_index + 1) % num_buffers;
352 state.ResumeTiming();
353
354 status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
355 if (status != xnn_status_success) {
356 state.SkipWithError("failed to run FP32 Convolution operator");
357 return;
358 }
359 }
360
361 for (xnn_operator_t& convolution_op : convolution_operators) {
362 status = xnn_delete_operator(convolution_op);
363 if (status != xnn_status_success) {
364 state.SkipWithError("failed to delete FP32 Convolution operator");
365 return;
366 }
367 convolution_op = nullptr;
368 }
369
370 state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
371 state.counters["FLOPS"] = benchmark::Counter(
372 uint64_t(state.iterations()) * 2 *
373 batch_size * output_height * output_width *
374 groups * group_input_channels * group_output_channels *
375 kernel_height * kernel_width,
376 benchmark::Counter::kIsRate);
377}
378
379#ifdef BENCHMARK_TENSORFLOW_LITE
void tflite_convolution_f32(benchmark::State& state, const char* net) {
  // Benchmarks TensorFlow Lite's FP32 convolution: builds an in-memory
  // FlatBuffer model containing a single CONV_2D (or DEPTHWISE_CONV_2D)
  // operator, then times interpreter invocations.
  //
  // Benchmark arguments (in order): batch size, input H, input W, kernel H,
  // kernel W, padding H, padding W, subsampling (stride), dilation, groups,
  // input channels per group, output channels per group.
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t subsampling = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  // TFLite has no generic grouped convolution: groups > 1 is only expressible
  // as DEPTHWISE_CONV_2D, which requires one input channel per group.
  bool is_depthwise = false;
  if (groups != 1) {
    if (group_input_channels == 1) {
      is_depthwise = true;
    } else {
      state.SkipWithError("grouped convolution is not supported");
      return;
    }
  }

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);

  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;

  // TFLite only supports SAME/VALID padding; map the explicit padding amounts
  // onto those two modes, or skip configurations that fit neither.
  tflite::Padding padding = tflite::Padding_VALID;
  if (padding_width == (effective_kernel_width - 1) && padding_height == (effective_kernel_height - 1)) {
    padding = tflite::Padding_SAME;
  } else if (padding_width == 0 && padding_height == 0) {
    padding = tflite::Padding_VALID;
  } else {
    state.SkipWithError("unsupported padding");
    return;
  }

  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

  // Randomized filter and bias; these are embedded in the model as constant
  // buffers. The input tensor is filled after tensor allocation below.
  std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
  std::vector<float> bias(groups * group_output_channels);
  std::generate(bias.begin(), bias.end(), std::ref(f32rng));

  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
      CreateOperatorCode(
        builder,
        is_depthwise ? tflite::BuiltinOperator_DEPTHWISE_CONV_2D : tflite::BuiltinOperator_CONV_2D,
        0);

  // Build both option tables up front; only the one matching is_depthwise is
  // attached to the operator below.
  flatbuffers::Offset<tflite::Conv2DOptions> conv2d_options = CreateConv2DOptions(
      builder,
      padding,
      static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),
      tflite::ActivationFunctionType_NONE,
      static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));

  flatbuffers::Offset<tflite::DepthwiseConv2DOptions> dwconv2d_options = CreateDepthwiseConv2DOptions(
      builder,
      padding,
      static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),
      static_cast<int32_t>(group_output_channels) /* depth_multiplier */,
      tflite::ActivationFunctionType_NONE,
      static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));

  // Buffer 0 is empty (TFLite convention for interpreter-allocated tensors);
  // buffers 1 and 2 hold the constant filter and bias data.
  flatbuffers::Offset<tflite::Buffer> buffers[3] = {
    tflite::CreateBuffer(builder, builder.CreateVector({})),
    tflite::CreateBuffer(builder, builder.CreateVector(
      reinterpret_cast<const uint8_t*>(kernel.data()),
      sizeof(float) * kernel.size())),
    tflite::CreateBuffer(builder, builder.CreateVector(
      reinterpret_cast<const uint8_t*>(bias.data()),
      sizeof(float) * bias.size())),
  };

  // Tensor shapes are NHWC; the filter is OHWI.
  const int32_t input_shape[4] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(input_height),
    static_cast<int32_t>(input_width),
    static_cast<int32_t>(groups * group_input_channels)
  };
  const int32_t output_shape[4] = {
    static_cast<int32_t>(batch_size),
    static_cast<int32_t>(output_height),
    static_cast<int32_t>(output_width),
    static_cast<int32_t>(groups * group_output_channels)
  };
  const int32_t filter_shape[4] = {
    static_cast<int32_t>(group_output_channels),
    static_cast<int32_t>(kernel_height),
    static_cast<int32_t>(kernel_width),
    static_cast<int32_t>(groups * group_input_channels)
  };
  const int32_t bias_shape[1] = {
    static_cast<int32_t>(groups * group_output_channels)
  };

  // Tensor indices: 0 = input, 1 = filter, 2 = bias, 3 = output. Input and
  // output reference the empty buffer 0, so the interpreter allocates them.
  flatbuffers::Offset<tflite::Tensor> tensors[4] = {
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(input_shape, 4),
                         tflite::TensorType_FLOAT32,
                         0 /* buffer id */,
                         builder.CreateString("input")),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(filter_shape, 4),
                         tflite::TensorType_FLOAT32,
                         1 /* buffer id */,
                         builder.CreateString("filter")),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(bias_shape, 1),
                         tflite::TensorType_FLOAT32,
                         2 /* buffer id */,
                         builder.CreateString("bias")),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(output_shape, 4),
                         tflite::TensorType_FLOAT32,
                         0 /* buffer id */,
                         builder.CreateString("output")),
  };

  const int32_t op_inputs[3] = { 0, 1, 2 };
  const int32_t op_outputs[1] = { 3 };
  flatbuffers::Offset<tflite::Operator> op = CreateOperator(
      builder,
      0 /* opcode_index */,
      builder.CreateVector<int32_t>(op_inputs, 3),
      builder.CreateVector<int32_t>(op_outputs, 1),
      is_depthwise ? tflite::BuiltinOptions_DepthwiseConv2DOptions : tflite::BuiltinOptions_Conv2DOptions,
      is_depthwise ? dwconv2d_options.Union() : conv2d_options.Union(),
      /*custom_options */ 0,
      tflite::CustomOptionsFormat_FLEXBUFFERS);

  const int32_t graph_inputs[1] = { 0 };
  const int32_t graph_outputs[1] = { 3 };
  flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(
      builder,
      builder.CreateVector(tensors, 4),
      builder.CreateVector<int32_t>(graph_inputs, 1),
      builder.CreateVector<int32_t>(graph_outputs, 1),
      builder.CreateVector(&op, 1),
      builder.CreateString("Conv2D subgraph"));

  flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Conv2D model");

  flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
      TFLITE_SCHEMA_VERSION,
      builder.CreateVector(&operator_code, 1),
      builder.CreateVector(&subgraph, 1),
      description,
      builder.CreateVector(buffers, 3));

  builder.Finish(model_buffer);

  // Instantiate a single-threaded interpreter over the in-memory model.
  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolver resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  if (interpreter == nullptr) {
    state.SkipWithError("TFLite interpreter is null");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  // Fill the interpreter-owned input tensor (tensor index 0) with random data.
  std::generate(
    interpreter->typed_tensor<float>(0),
    interpreter->typed_tensor<float>(0) + batch_size * groups * group_input_channels * input_height * input_width,
    std::ref(f32rng));

  for (auto _ : state) {
    state.PauseTiming();
    // Flush all cached state, then re-warm only the input, so every timed
    // Invoke sees the same cache conditions.
    benchmark::utils::WipeCache();
    benchmark::utils::PrefetchToL1(
      interpreter->typed_tensor<float>(0),
      batch_size * groups * group_input_channels * input_height * input_width * sizeof(float));
    state.ResumeTiming();

    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  // 2 * MACs per output element, reported as a rate.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * output_height * output_width *
      groups * group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);

  interpreter.reset();
}
587#endif // BENCHMARK_TENSORFLOW_LITE
588
589#ifdef BENCHMARK_ARM_COMPUTE_LIBRARY
// Recomputes the convolution with a naive reference implementation (NHWC
// input/output, kernel laid out as [O][H][W][G*I]) and compares it against
// `output` element-wise with a relative tolerance. Returns an empty string on
// success, or a human-readable description of the first mismatch.
// Benchmark arguments are re-read from `state` exactly as in the benchmarks
// above; the *_size parameters are only used for sanity `assert`s.
static std::string compare_with_convolution_f32_reference_output(
  const benchmark::State& state, const float* input, size_t input_size,
  const float* kernel, size_t kernel_size, const float* bias, size_t bias_size,
  const float* output, size_t output_size)
{
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t subsampling = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
  const size_t input_pixel_stride = groups * group_input_channels;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;

  assert(input_size == batch_size * input_height * input_width * groups * group_input_channels);

  assert(kernel_size == group_output_channels * kernel_height * kernel_width * groups * group_input_channels);

  assert(bias_size == groups * group_output_channels);

  assert(output_size == batch_size * output_height * output_width * groups * group_output_channels);

  // Pass 1: initialize every reference output element with its bias value.
  std::vector<float> output_ref(output_size);
  for (size_t i = 0; i < batch_size; i++) {
    for (size_t oy = 0; oy < output_height; oy++) {
      for (size_t ox = 0; ox < output_width; ox++) {
        for (size_t g = 0; g < groups; g++) {
          for (size_t oc = 0; oc < group_output_channels; oc++) {
            output_ref[(((i * output_height + oy) * output_width + ox) * groups + g) * group_output_channels + oc] =
              bias[g * group_output_channels + oc];
          }
        }
      }
    }
  }
  // Pass 2: accumulate the convolution sums. The bounds checks `iy <
  // input_height` / `ix < input_width` also reject out-of-padding taps:
  // iy/ix are size_t, so a logically negative coordinate wraps around to a
  // huge value and fails the comparison.
  for (size_t i = 0; i < batch_size; i++) {
    for (size_t oy = 0; oy < output_height; oy++) {
      for (size_t ox = 0; ox < output_width; ox++) {
        for (size_t ky = 0; ky < kernel_height; ky++) {
          const size_t iy = oy * subsampling + ky * dilation - padding_top;
          if (iy < input_height) {
            for (size_t kx = 0; kx < kernel_width; kx++) {
              const size_t ix = ox * subsampling + kx * dilation - padding_left;
              if (ix < input_width) {
                for (size_t g = 0; g < groups; g++) {
                  for (size_t oc = 0; oc < group_output_channels; oc++) {
                    for (size_t ic = 0; ic < group_input_channels; ic++) {
                      output_ref[(((i * output_height + oy) * output_width + ox) * groups + g) * group_output_channels + oc] +=
                        input[((i * input_height + iy) * input_width + ix) * input_pixel_stride + g * group_input_channels + ic] *
                        kernel[(((oc * kernel_height + ky) * kernel_width + kx) * groups + g) * group_input_channels + ic];
                    }  // group_input_channels loop
                  }  // group_output_channels loop
                }  // groups loop
              }
            }  // kernel_width loop
          }
        }  // kernel_height loop
      }  // output_width loop
    }  // output_height loop
  }  // batch_size loop

  // Pass 3: element-wise comparison. Tolerance is relative to the reference
  // value, with a floor of FP32 epsilon for values near zero.
  const float relative_error_tolerance = 1e-4;
  for (size_t i = 0; i < batch_size; i++) {
    for (size_t y = 0; y < output_height; y++) {
      for (size_t x = 0; x < output_width; x++) {
        for (size_t g = 0; g < groups; g++) {
          for (size_t c = 0; c < group_output_channels; c++) {
            const size_t idx = (((i * output_height + y) * output_width + x) * groups + g) * group_output_channels + c;
            const float value_ref = output_ref[idx];
            const float value = output[idx];
            if (std::abs(value - value_ref) > std::max(std::abs(value_ref) * relative_error_tolerance, std::numeric_limits<float>::epsilon())) {
              // NOTE(review): the message omits the batch index `i`; consider
              // including it if multi-batch mismatches need triage.
              std::ostringstream error_stream;
              error_stream << "(x, y) = (" << x << ", " << y << "), group = " << g
                << ", channel = " << c << ", refValue = " << value_ref
                << ", actualValue = " << value
                << ", absDiff=" << std::abs(value - value_ref);
              return error_stream.str();
            }
          }
        }
      }
    }
  }
  return "";
}
687
688void armcl_convolution_f32(benchmark::State& state, const char* net) {
689 const size_t batch_size = state.range(0);
690 const size_t input_height = state.range(1);
691 const size_t input_width = state.range(2);
692 const size_t kernel_height = state.range(3);
693 const size_t kernel_width = state.range(4);
694 const size_t padding_height = state.range(5);
695 const size_t padding_width = state.range(6);
696 const size_t subsampling = state.range(7);
697 const size_t dilation = state.range(8);
698 const size_t groups = state.range(9);
699 const size_t group_input_channels = state.range(10);
700 const size_t group_output_channels = state.range(11);
701
702 const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
703 const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
704 const size_t padding_left = padding_width / 2;
705 const size_t padding_top = padding_height / 2;
706 const size_t padding_right = padding_width - padding_left;
707 const size_t padding_bottom = padding_height - padding_top;
708 const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
709 const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;
710
711 arm_compute::PadStrideInfo pad_stride_info(
712 subsampling /* stride height */,
713 subsampling /* stride width */,
714 padding_left, padding_right, padding_top, padding_bottom,
715 arm_compute::DimensionRoundingType::FLOOR);
716 arm_compute::Size2D dilation_info(dilation, dilation);
717 // Note: activation is disabled by default.
718 arm_compute::ActivationLayerInfo activation_info;
719
720 // Note: no batch size and reverse order of dimensions, i.e. CWHN for NHWC.
721 arm_compute::TensorShape input_shape(
722 /* C */ groups * group_input_channels,
723 /* W */ input_width,
724 /* H */ input_height,
725 /* N */ batch_size);
726 arm_compute::TensorInfo input_info(
727 input_shape,
728 1 /* number of channels per element (!) */,
729 arm_compute::DataType::F32);
730 input_info.set_data_layout(arm_compute::DataLayout::NHWC);
731 arm_compute::Tensor input_tensor;
732 input_tensor.allocator()->init(input_info);
733 input_tensor.allocator()->allocate();
734
735 // Note: reverse order of dimensions, i.e. for IWHO for OHWI.
736 arm_compute::TensorShape kernel_shape(
737 /* I */ groups * group_input_channels,
738 /* W */ kernel_width,
739 /* H */ kernel_height,
740 /* O */ group_output_channels);
741 arm_compute::TensorInfo kernel_info(
742 kernel_shape,
743 1 /* number of channels per element (!) */,
744 arm_compute::DataType::F32);
745 kernel_info.set_data_layout(arm_compute::DataLayout::NHWC);
746 arm_compute::Tensor kernelTensor;
747 kernelTensor.allocator()->init(kernel_info);
748 kernelTensor.allocator()->allocate();
749
750 arm_compute::TensorShape bias_shape(groups * group_output_channels);
751 arm_compute::TensorInfo bias_info(
752 bias_shape,
753 1 /* number of channels per element (!) */,
754 arm_compute::DataType::F32);
755 bias_info.set_data_layout(arm_compute::DataLayout::NHWC);
756 arm_compute::Tensor bias_tensor;
757 bias_tensor.allocator()->init(bias_info);
758 bias_tensor.allocator()->allocate();
759
760 // Note: no batch size and reverse order of dimensions, i.e. CWHN for NHWC.
761 arm_compute::TensorShape output_shape(
762 /* C */ groups * group_output_channels,
763 /* W */ output_width,
764 /* H */ output_height,
765 /* N */ batch_size);
766 arm_compute::TensorInfo output_info(
767 output_shape,
768 1 /* number of channels per element (!) */,
769 arm_compute::DataType::F32);
770 output_info.set_data_layout(arm_compute::DataLayout::NHWC);
771 arm_compute::Tensor output_tensor;
772 output_tensor.allocator()->init(output_info);
773 output_tensor.allocator()->allocate();
774
775 std::random_device random_device;
776 auto rng = std::mt19937(random_device());
777 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
778
779 std::generate(
780 reinterpret_cast<float*>(input_tensor.buffer()),
781 reinterpret_cast<float*>(input_tensor.buffer()) + input_shape.total_size(),
782 std::ref(f32rng));
783 std::generate(
784 reinterpret_cast<float*>(kernelTensor.buffer()),
785 reinterpret_cast<float*>(kernelTensor.buffer()) + kernel_shape.total_size(),
786 std::ref(f32rng));
787 std::generate(
788 reinterpret_cast<float*>(bias_tensor.buffer()),
789 reinterpret_cast<float*>(bias_tensor.buffer()) + bias_shape.total_size(),
790 std::ref(f32rng));
791 std::generate(
792 reinterpret_cast<float*>(output_tensor.buffer()),
793 reinterpret_cast<float*>(output_tensor.buffer()) + output_shape.total_size(),
794 std::ref(f32rng));
795
796 bool is_depthwise = false;
797 if (groups != 1) {
798 // NEConvolutionLayer uses NEGEMMConvolutionLayer by default, which doesn't support grouped convolution.
799 // However, depthwise convolution is supported via NEDepthwiseConvolutionLayer.
800 if (group_input_channels == 1) {
801 is_depthwise = true;
802 } else {
803 state.SkipWithError("grouped convolution is not supported");
804 return;
805 }
806 }
807
808 std::shared_ptr<arm_compute::IFunction> layer;
809 if (is_depthwise) {
810 if (dilation != 1) {
811 state.SkipWithError("dilated depthwise convolution is not supported");
812 return;
813 }
814
815 // Avoid NEDepthwiseConvolutionLayer3x3 when stride isn't 2 in order to pass the output verification.
816 // TODO(b/130206370) This looks like a bug and needs further investigation.
817 if (kernel_height == 3 && kernel_width == 3 && subsampling == 2) {
818 auto* depthwise_3x3_convolution_layer = new arm_compute::NEDepthwiseConvolutionLayer3x3();
819 layer.reset(depthwise_3x3_convolution_layer);
820 depthwise_3x3_convolution_layer->configure(
821 &input_tensor, &kernelTensor, &bias_tensor, &output_tensor,
822 pad_stride_info, group_output_channels, activation_info);
823
824 if (!depthwise_3x3_convolution_layer->validate(
825 &input_info, &kernel_info, &bias_info, &output_info,
826 pad_stride_info, group_output_channels, activation_info))
827 {
828 state.SkipWithError("validation failed");
829 return;
830 }
831 } else {
832 auto* depthwise_convolution_layer = new arm_compute::NEDepthwiseConvolutionLayer();
833 layer.reset(depthwise_convolution_layer);
834 depthwise_convolution_layer->configure(
835 &input_tensor, &kernelTensor, &bias_tensor, &output_tensor,
836 pad_stride_info, group_output_channels, activation_info);
837
838 if (!depthwise_convolution_layer->validate(
839 &input_info, &kernel_info, &bias_info, &output_info,
840 pad_stride_info, group_output_channels, activation_info))
841 {
842 state.SkipWithError("validation failed");
843 return;
844 }
845 }
846 } else {
847 auto* convolution_layer = new arm_compute::NEConvolutionLayer();
848 layer.reset(convolution_layer);
849 convolution_layer->configure(
850 &input_tensor, &kernelTensor, &bias_tensor, &output_tensor,
851 pad_stride_info, arm_compute::WeightsInfo(), dilation_info, activation_info,
852 true /* enable fast math */, groups);
853
854 if (!convolution_layer->validate(
855 &input_info, &kernel_info, &bias_info, &output_info,
856 pad_stride_info, arm_compute::WeightsInfo(), dilation_info, activation_info,
857 true /* enable fast math */, groups))
858 {
859 state.SkipWithError("validation failed");
860 return;
861 }
862 }
863
864 // Dry run to let ACL do one-time initializations.
865 arm_compute::CPPScheduler::get().set_num_threads(1);
866 layer->run();
867
868 for (auto _ : state) {
869 state.PauseTiming();
Marat Dukhan42323232019-10-23 02:09:02 -0700870 benchmark::utils::WipeCache();
871 benchmark::utils::PrefetchToL1(
XNNPACK Teamb455b122019-09-27 18:10:33 -0700872 input_tensor.buffer(),
873 batch_size * groups * group_input_channels * input_height * input_width * sizeof(float));
874 state.ResumeTiming();
875
876 layer->run();
877 }
878
879 // Validate outputs.
880 const std::string error_string = compare_with_convolution_f32_reference_output(
881 state, reinterpret_cast<const float*>(input_tensor.buffer()),
882 input_shape.total_size(),
883 reinterpret_cast<const float*>(kernelTensor.buffer()),
884 kernel_shape.total_size(),
885 reinterpret_cast<const float*>(bias_tensor.buffer()),
886 bias_shape.total_size(),
887 reinterpret_cast<const float*>(output_tensor.buffer()),
888 output_shape.total_size());
889
890 if (!error_string.empty()) {
891 state.SkipWithError(("validation failed: " + error_string).c_str());
892 return;
893 }
894
895 input_tensor.allocator()->free();
896 kernelTensor.allocator()->free();
897 bias_tensor.allocator()->free();
898 output_tensor.allocator()->free();
899
900 state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
901 state.counters["FLOPS"] = benchmark::Counter(
902 uint64_t(state.iterations()) * 2 *
903 batch_size * output_height * output_width *
904 groups * group_input_channels * group_output_channels *
905 kernel_height * kernel_width,
906 benchmark::Counter::kIsRate);
907}
908#endif // BENCHMARK_ARM_COMPUTE_LIBRARY
909
910// ShuffleNet v1 with 1 group.
911static void ShuffleNetV1G1(benchmark::internal::Benchmark* b) {
912 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
913
914 /*************************** Conv 1 **************************/
915 /* N H W KH KW PH PW S D G GCin GCout */
916 b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
917 /******************* Stage 2: stride-2 unit ******************/
918 /* N H W KH KW PH PW S D G GCin GCout */
919 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 36});
920 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 36, 1, 1});
921 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 36, 120});
922 /******************* Stage 2: stride-1 units *****************/
923 /* N H W KH KW PH PW S D G GCin GCout */
924 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 36});
925 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 36, 1, 1});
926 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 36, 144});
927 /******************* Stage 3: stride-2 unit ******************/
928 /* N H W KH KW PH PW S D G GCin GCout */
929 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 72});
930 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 72, 1, 1});
931 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 72, 144});
932 /******************* Stage 3: stride-1 units *****************/
933 /* N H W KH KW PH PW S D G GCin GCout */
934 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 288, 72});
935 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 72, 1, 1});
936 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 72, 288});
937 /******************* Stage 4: stride-2 unit ******************/
938 /* N H W KH KW PH PW S D G GCin GCout */
939 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 288, 144});
940 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 144, 1, 1});
941 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 144, 288});
942 /******************* Stage 4: stride-1 units *****************/
943 /* N H W KH KW PH PW S D G GCin GCout */
944 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 144});
945 b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 144, 1, 1});
946 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 144, 576});
947}
948
949// ShuffleNet v1 with 2 groups.
950static void ShuffleNetV1G2(benchmark::internal::Benchmark* b) {
951 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
952
953 /*************************** Conv 1 **************************/
954 /* N H W KH KW PH PW S D G GCin GCout */
955 b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
956 /******************* Stage 2: stride-2 unit ******************/
957 /* N H W KH KW PH PW S D G GCin GCout */
958 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 50});
959 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 50, 1, 1});
960 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 25, 88});
961 /******************* Stage 2: stride-1 units *****************/
962 /* N H W KH KW PH PW S D G GCin GCout */
963 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 100, 25});
964 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 50, 1, 1});
965 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 25, 100});
966 /******************* Stage 3: stride-2 unit ******************/
967 /* N H W KH KW PH PW S D G GCin GCout */
968 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 100, 50});
969 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 100, 1, 1});
970 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 50, 100});
971 /******************* Stage 3: stride-1 units *****************/
972 /* N H W KH KW PH PW S D G GCin GCout */
973 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 200, 50});
974 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 100, 1, 1});
975 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 50, 200});
976 /******************* Stage 4: stride-2 unit ******************/
977 /* N H W KH KW PH PW S D G GCin GCout */
978 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 200, 100});
979 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 200, 1, 1});
980 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 100, 200});
981 /******************* Stage 4: stride-1 units *****************/
982 /* N H W KH KW PH PW S D G GCin GCout */
983 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 400, 100});
984 b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 200, 1, 1});
985 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 100, 400});
986}
987
988// ShuffleNet v1 with 3 groups.
989static void ShuffleNetV1G3(benchmark::internal::Benchmark* b) {
990 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
991
992 /*************************** Conv 1 **************************/
993 /* N H W KH KW PH PW S D G GCin GCout */
994 b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
995 /******************* Stage 2: stride-2 unit ******************/
996 /* N H W KH KW PH PW S D G GCin GCout */
997 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 60});
998 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 60, 1, 1});
999 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 20, 72});
1000 /******************* Stage 2: stride-1 units *****************/
1001 /* N H W KH KW PH PW S D G GCin GCout */
1002 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 80, 20});
1003 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 60, 1, 1});
1004 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 20, 80});
1005 /******************* Stage 3: stride-2 unit ******************/
1006 /* N H W KH KW PH PW S D G GCin GCout */
1007 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 80, 40});
1008 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 120, 1, 1});
1009 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 40, 80});
1010 /******************* Stage 3: stride-1 units *****************/
1011 /* N H W KH KW PH PW S D G GCin GCout */
1012 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 160, 40});
1013 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 120, 1, 1});
1014 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 40, 160});
1015 /******************* Stage 4: stride-2 unit ******************/
1016 /* N H W KH KW PH PW S D G GCin GCout */
1017 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 160, 80});
1018 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 240, 1, 1});
1019 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 80, 160});
1020 /******************* Stage 4: stride-1 units *****************/
1021 /* N H W KH KW PH PW S D G GCin GCout */
1022 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 320, 80});
1023 b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 240, 1, 1});
1024 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 80, 320});
1025}
1026
1027// ShuffleNet v1 with 4 groups.
1028static void ShuffleNetV1G4(benchmark::internal::Benchmark* b) {
1029 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1030
1031 /*************************** Conv 1 **************************/
1032 /* N H W KH KW PH PW S D G GCin GCout */
1033 b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
1034 /******************* Stage 2: stride-2 unit ******************/
1035 /* N H W KH KW PH PW S D G GCin GCout */
1036 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 68});
1037 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 68, 1, 1});
1038 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 17, 62});
1039 /******************* Stage 2: stride-1 units *****************/
1040 /* N H W KH KW PH PW S D G GCin GCout */
1041 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 68, 17});
1042 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 68, 1, 1});
1043 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 17, 68});
1044 /******************* Stage 3: stride-2 unit ******************/
1045 /* N H W KH KW PH PW S D G GCin GCout */
1046 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 68, 34});
1047 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 136, 1, 1});
1048 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 34, 68});
1049 /******************* Stage 3: stride-1 units *****************/
1050 /* N H W KH KW PH PW S D G GCin GCout */
1051 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 136, 34});
1052 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 136, 1, 1});
1053 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 34, 136});
1054 /******************* Stage 4: stride-2 unit ******************/
1055 /* N H W KH KW PH PW S D G GCin GCout */
1056 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 136, 68});
1057 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 272, 1, 1});
1058 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 68, 136});
1059 /******************* Stage 4: stride-1 units *****************/
1060 /* N H W KH KW PH PW S D G GCin GCout */
1061 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 272, 68});
1062 b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 272, 1, 1});
1063 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 68, 272});
1064}
1065
1066// ShuffleNet v1 with 8 groups.
1067static void ShuffleNetV1G8(benchmark::internal::Benchmark* b) {
1068 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1069
1070 /*************************** Conv 1 **************************/
1071 /* N H W KH KW PH PW S D G GCin GCout */
1072 b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
1073 /******************* Stage 2: stride-2 unit ******************/
1074 /* N H W KH KW PH PW S D G GCin GCout */
1075 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 96});
1076 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 96, 1, 1});
1077 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 12, 45});
1078 /******************* Stage 2: stride-1 units *****************/
1079 /* N H W KH KW PH PW S D G GCin GCout */
1080 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 48, 12});
1081 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 96, 1, 1});
1082 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 12, 48});
1083 /******************* Stage 3: stride-2 unit ******************/
1084 /* N H W KH KW PH PW S D G GCin GCout */
1085 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 48, 24});
1086 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 192, 1, 1});
1087 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 24, 48});
1088 /******************* Stage 3: stride-1 units *****************/
1089 /* N H W KH KW PH PW S D G GCin GCout */
1090 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 96, 24});
1091 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 192, 1, 1});
1092 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 24, 96});
1093 /******************* Stage 4: stride-2 unit ******************/
1094 /* N H W KH KW PH PW S D G GCin GCout */
1095 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 96, 48});
1096 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 384, 1, 1});
1097 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 48, 96});
1098 /******************* Stage 4: stride-1 units *****************/
1099 /* N H W KH KW PH PW S D G GCin GCout */
1100 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 192, 48});
1101 b->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 384, 1, 1});
1102 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 48, 192});
1103}
1104
1105// ShuffleNet v2 (0.5X scale)
1106static void ShuffleNetV2X05(benchmark::internal::Benchmark* b) {
1107 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1108
1109 /*************************** Conv 1 **************************/
1110 /* N H W KH KW PH PW S D G GCin GCout */
1111 b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
1112 /************************** Stage 2 **************************/
1113 /* N H W KH KW PH PW S D G GCin GCout */
1114 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
1115 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 24});
1116 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 24});
1117 b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 24, 1, 1});
1118 /************************** Stage 3 **************************/
1119 /* N H W KH KW PH PW S D G GCin GCout */
1120 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 48, 1, 1});
1121 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 48});
1122 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 48, 48});
1123 b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 48, 1, 1});
1124 /************************** Stage 4 **************************/
1125 /* N H W KH KW PH PW S D G GCin GCout */
1126 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 96, 1, 1});
1127 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 96});
1128 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 96});
1129 b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 96, 1, 1});
1130 /*************************** Conv 5 **************************/
1131 /* N H W KH KW PH PW S D G GCin GCout */
1132 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 192, 1024});
1133}
1134
1135// ShuffleNet v2 (1.0X scale)
1136static void ShuffleNetV2X10(benchmark::internal::Benchmark* b) {
1137 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1138
1139 /*************************** Conv 1 **************************/
1140 /* N H W KH KW PH PW S D G GCin GCout */
1141 b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
1142 /************************** Stage 2 **************************/
1143 /* N H W KH KW PH PW S D G GCin GCout */
1144 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
1145 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 58});
1146 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 58});
1147 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 58, 1, 1});
1148 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 58, 58});
1149 b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 58, 1, 1});
1150 /************************** Stage 3 **************************/
1151 /* N H W KH KW PH PW S D G GCin GCout */
1152 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 116, 1, 1});
1153 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 116, 116});
1154 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 116, 116});
1155 b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 116, 1, 1});
1156 /************************** Stage 4 **************************/
1157 /* N H W KH KW PH PW S D G GCin GCout */
1158 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 232, 1, 1});
1159 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 232, 232});
1160 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 232, 232});
1161 b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 232, 1, 1});
1162 /*************************** Conv 5 **************************/
1163 /* N H W KH KW PH PW S D G GCin GCout */
1164 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 464, 1024});
1165}
1166
1167// ShuffleNet v2 (1.5X scale)
1168static void ShuffleNetV2X15(benchmark::internal::Benchmark* b) {
1169 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1170
1171 /*************************** Conv 1 **************************/
1172 /* N H W KH KW PH PW S D G GCin GCout */
1173 b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
1174 /************************** Stage 2 **************************/
1175 /* N H W KH KW PH PW S D G GCin GCout */
1176 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
1177 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 88});
1178 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 88});
1179 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 88, 1, 1});
1180 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 88, 88});
1181 b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 88, 1, 1});
1182 /************************** Stage 3 **************************/
1183 /* N H W KH KW PH PW S D G GCin GCout */
1184 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 176, 1, 1});
1185 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 176, 176});
1186 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 176, 176});
1187 b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 176, 1, 1});
1188 /************************** Stage 4 **************************/
1189 /* N H W KH KW PH PW S D G GCin GCout */
1190 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 352, 1, 1});
1191 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 352, 352});
1192 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 352, 352});
1193 b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 352, 1, 1});
1194 /*************************** Conv 5 **************************/
1195 /* N H W KH KW PH PW S D G GCin GCout */
1196 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 704, 1024});
1197}
1198
1199// ShuffleNet v2 (2.0X scale)
1200static void ShuffleNetV2X20(benchmark::internal::Benchmark* b) {
1201 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1202
1203 /*************************** Conv 1 **************************/
1204 /* N H W KH KW PH PW S D G GCin GCout */
1205 b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24});
1206 /************************** Stage 2 **************************/
1207 /* N H W KH KW PH PW S D G GCin GCout */
1208 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1});
1209 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 122});
1210 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 122});
1211 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 122, 1, 1});
1212 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 122, 122});
1213 b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 122, 1, 1});
1214 /************************** Stage 3 **************************/
1215 /* N H W KH KW PH PW S D G GCin GCout */
1216 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 244, 1, 1});
1217 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 244, 244});
1218 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 244, 244});
1219 b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 244, 1, 1});
1220 /************************** Stage 4 **************************/
1221 /* N H W KH KW PH PW S D G GCin GCout */
1222 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 488, 1, 1});
1223 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 488, 488});
1224 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 488, 488});
1225 b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 488, 1, 1});
1226 /*************************** Conv 5 **************************/
1227 /* N H W KH KW PH PW S D G GCin GCout */
1228 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 976, 2048});
1229}
1230
1231static void MobileNetV1(benchmark::internal::Benchmark* b) {
1232 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1233
1234 /* N H W KH KW PH PW S D G GCin GCout */
1235 b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 32});
1236 b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 32, 1, 1});
1237 b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 32, 64});
1238 b->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 64, 1, 1});
1239 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 128});
1240 b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 128, 1, 1});
1241 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 128, 128});
1242 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 128, 1, 1});
1243 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 256});
1244 b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 256, 1, 1});
1245 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 256, 256});
1246 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 256, 1, 1});
1247 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 512});
1248 b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 512, 1, 1});
1249 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 512, 512});
1250 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 512, 1, 1});
1251 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 1024});
1252 b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 1024, 1, 1});
1253 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 1024, 1024});
1254}
1255
1256static void MobileNetV2(benchmark::internal::Benchmark* b) {
1257 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1258
1259 /* N H W KH KW PH PW S D G GCin GCout */
1260 b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 32});
1261
1262 /************************ Bottleneck 1 ***********************/
1263 /* N H W KH KW PH PW S D G GCin GCout */
1264 b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 32, 1, 1});
1265 b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 32, 16});
1266
1267 /************************ Bottleneck 2 ***********************/
1268 /* N H W KH KW PH PW S D G GCin GCout */
1269 b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 96});
1270 b->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 96, 1, 1});
1271 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 96, 24});
1272 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 144});
1273 b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 144, 1, 1});
1274 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 144, 24});
1275
1276 /************************ Bottleneck 3 ***********************/
1277 /* N H W KH KW PH PW S D G GCin GCout */
1278//b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 144});
1279 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 144, 1, 1});
1280 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 32});
1281 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192});
1282 b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 192, 1, 1});
1283 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 192, 32});
1284//b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192});
1285//b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 192, 1, 1});
1286//b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 192, 32});
1287
1288 /************************ Bottleneck 4 ***********************/
1289 /* N H W KH KW PH PW S D G GCin GCout */
1290//b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192});
1291 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 192, 1, 1});
1292 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 192, 64});
1293 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
1294 b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
1295 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64});
1296//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
1297//b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
1298//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64});
1299//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
1300//b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
1301//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64});
1302
1303 /************************ Bottleneck 5 ***********************/
1304 /* N H W KH KW PH PW S D G GCin GCout */
1305//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384});
1306//b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1});
1307 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 96});
1308 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576});
1309 b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 576, 1, 1});
1310 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 576, 96});
1311//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576});
1312//b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 576, 1, 1});
1313//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 576, 96});
1314
1315 /************************ Bottleneck 6 ***********************/
1316 /* N H W KH KW PH PW S D G GCin GCout */
1317//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576});
1318 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 576, 1, 1});
1319 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 160});
1320 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
1321 b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1});
1322 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
1323//b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
1324//b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1});
1325//b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
1326
1327 /************************ Bottleneck 7 ***********************/
1328 /* N H W KH KW PH PW S D G GCin GCout */
1329//b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
1330//b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1});
1331 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 320});
1332
1333 /******************** Pre-pooling Conv2D *********************/
1334 /* N H W KH KW PH PW S D G GCin GCout */
1335 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 320, 1280});
1336 /******************** Post-pooling Conv2D ********************/
1337 /* N H W KH KW PH PW S D G GCin GCout */
1338 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1280, 1000});
1339}
1340
1341static void MobileNetV3Small(benchmark::internal::Benchmark* b) {
1342 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1343
1344 /*********************** Initial Stage ***********************/
1345 /* N H W KH KW PH PW S D G GCin GCout */
1346 b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 16});
1347 /*********************** Bottleneck 1 ************************/
1348 /* N H W KH KW PH PW S D G GCin GCout */
1349 b->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 16, 1, 1});
1350 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 16, 8});
1351 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 8, 16});
1352 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 16, 16});
1353 /*********************** Bottleneck 2 ************************/
1354 /* N H W KH KW PH PW S D G GCin GCout */
1355 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 16, 72});
1356 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 72, 1, 1});
1357 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 72, 24});
1358 /*********************** Bottleneck 3 ************************/
1359 /* N H W KH KW PH PW S D G GCin GCout */
1360 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 88});
1361 b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 88, 1, 1});
1362 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 88, 24});
1363 /*********************** Bottleneck 4 ************************/
1364 /* N H W KH KW PH PW S D G GCin GCout */
1365 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 96});
1366 b->Args({1, 28, 28, 5, 5, 4, 4, 2, 1, 96, 1, 1});
1367 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 96, 24});
1368 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 24, 96});
1369 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 40});
1370 /*********************** Bottleneck 5 ************************/
1371 /* N H W KH KW PH PW S D G GCin GCout */
1372 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 240});
1373 b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 240, 1, 1});
1374 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 64});
1375 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 64, 240});
1376 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 40});
1377 /*********************** Bottleneck 6 ************************/
1378 /* N H W KH KW PH PW S D G GCin GCout */
1379//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 240});
1380//b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 240, 1, 1});
1381//b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 64});
1382//b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 64, 240});
1383//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 40});
1384 /*********************** Bottleneck 7 ************************/
1385 /* N H W KH KW PH PW S D G GCin GCout */
1386 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 120});
1387 b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 120, 1, 1});
1388 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32});
1389 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120});
1390 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 120, 48});
1391 /*********************** Bottleneck 8 ************************/
1392 /* N H W KH KW PH PW S D G GCin GCout */
1393 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 144});
1394 b->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 144, 1, 1});
1395 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 40});
1396 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 40, 144});
1397 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 144, 48});
1398 /*********************** Bottleneck 9 ************************/
1399 /* N H W KH KW PH PW S D G GCin GCout */
1400 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 288});
1401 b->Args({1, 14, 14, 5, 5, 4, 4, 2, 1, 288, 1, 1});
1402 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 288, 72});
1403 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 72, 288});
1404 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 288, 96});
1405 /*********************** Bottleneck 10 ***********************/
1406 /* N H W KH KW PH PW S D G GCin GCout */
1407 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576});
1408 b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 576, 1, 1});
1409 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 144});
1410 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 576});
1411 b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 96});
1412 /*********************** Bottleneck 11 ***********************/
1413 /* N H W KH KW PH PW S D G GCin GCout */
1414//b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576});
1415//b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 576, 1, 1});
1416//b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 144});
1417//b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 576});
1418//b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 96});
1419 /************************ Last Stage ************************/
1420 /* N H W KH KW PH PW S D G GCin GCout */
1421//b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576});
1422 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 1024});
1423 b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1024, 1001});
1424}
1425
// Registers the distinct convolution shapes of MobileNet v3 Large as
// benchmark arguments. Each Args() row is
// {N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout}: batch, input height/width,
// kernel height/width, padding, stride, dilation, groups, and per-group
// input/output channels. Rows with G > 1 and GCin == GCout == 1 are the
// depthwise convolutions; the 1x1 rows at H == W == 1 are the squeeze-and-
// excitation reductions/expansions. PH/PW appear to be the summed two-sided
// padding (e.g. 3x3 with PH=2 preserves H at stride 1) -- confirm against the
// benchmark harness. Commented-out rows duplicate a shape already registered
// earlier in this function and are skipped to avoid benchmarking it twice.
static void MobileNetV3Large(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*********************** Initial Stage ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 16});
  /*********************** Bottleneck 1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 16, 1, 1});
  b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 16});
  /*********************** Bottleneck 2 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 64});
  b->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 64, 1, 1});
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 24});
  /*********************** Bottleneck 3 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 72});
  b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 72, 1, 1});
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 72, 24});
  /*********************** Bottleneck 4 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
//b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 72});
  b->Args({1, 56, 56, 5, 5, 4, 4, 2, 1, 72, 1, 1});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 72, 24});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 24, 72});
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 72, 40});
  /*********************** Bottleneck 5 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 120});
  b->Args({1, 28, 28, 5, 5, 4, 4, 1, 1, 120, 1, 1});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120});
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 120, 40});
  /*********************** Bottleneck 6 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
//b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 120});
//b->Args({1, 28, 28, 5, 5, 4, 4, 1, 1, 120, 1, 1});
//b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32});
//b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120});
//b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 120, 40});
  /*********************** Bottleneck 7 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 240});
  b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 240, 1, 1});
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 80});
  /*********************** Bottleneck 8 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 200});
  b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 200, 1, 1});
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 200, 80});
  /*********************** Bottleneck 9 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 184});
  b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 184, 1, 1});
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 184, 80});
  /********************** Bottleneck 10 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 184});
//b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 184, 1, 1});
//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 184, 80});
  /********************** Bottleneck 11 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 480});
  b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 480, 1, 1});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 480, 120});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 480});
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 480, 112});
  /********************** Bottleneck 12 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 112, 672});
  b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 672, 1, 1});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 672, 168});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 168, 672});
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 672, 112});
  /********************** Bottleneck 13 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 112, 672});
  b->Args({1, 14, 14, 5, 5, 4, 4, 2, 1, 672, 1, 1});
  b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 672, 160});
  /********************** Bottleneck 14 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
  b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
  b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 960, 1, 1});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 240});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 960});
  b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
  /********************** Bottleneck 15 ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
//b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
//b->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 960, 1, 1});
//b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 240});
//b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 960});
//b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160});
  /************************ Last Stage ***********************/
  /*       N   H    W   KH  KW  PH  PW  S  D    G  GCin  GCout */
//b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 1280});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1280, 1001});
}
1526
1527// SqueezeNet 1.0
1528static void SqueezeNetV10(benchmark::internal::Benchmark* b) {
1529 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1530
1531 /************************** Conv 1 *************************/
1532 /* N H W KH KW PH PW S D G GCin GCout */
1533 b->Args({1, 224, 224, 7, 7, 6, 6, 2, 1, 1, 3, 96});
1534 /************************** Fire 2 *************************/
1535 /* N H W KH KW PH PW S D G GCin GCout */
1536 b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 96, 16});
1537 b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
1538 b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
1539 /************************** Fire 3 *************************/
1540 /* N H W KH KW PH PW S D G GCin GCout */
1541 b->Args({1, 56, 55, 1, 1, 0, 0, 1, 1, 1, 128, 16});
1542//b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
1543//b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
1544 /************************** Fire 4 *************************/
1545 /* N H W KH KW PH PW S D G GCin GCout */
1546 b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 128, 32});
1547 b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 32, 128});
1548 b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 32, 128});
1549 /************************** Fire 5 *************************/
1550 /* N H W KH KW PH PW S D G GCin GCout */
1551 b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 256, 32});
1552 b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 32, 128});
1553 b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 32, 128});
1554 /************************** Fire 6 *************************/
1555 /* N H W KH KW PH PW S D G GCin GCout */
1556 b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 256, 48});
1557 b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 48, 192});
1558 b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 48, 192});
1559 /************************** Fire 7 *************************/
1560 /* N H W KH KW PH PW S D G GCin GCout */
1561 b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 384, 48});
1562//b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 48, 192});
1563//b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 48, 192});
1564 /************************** Fire 8 *************************/
1565 /* N H W KH KW PH PW S D G GCin GCout */
1566 b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 384, 64});
1567 b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 64, 256});
1568 b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 64, 256});
1569 /************************** Fire 9 *************************/
1570 /* N H W KH KW PH PW S D G GCin GCout */
1571 b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 64});
1572 b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 64, 256});
1573 b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 64, 256});
1574 /************************* Conv 10 *************************/
1575 /* N H W KH KW PH PW S D G GCin GCout */
1576 b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 1000});
1577}
1578
// SqueezeNet 1.1
// Registers the distinct convolution shapes of SqueezeNet 1.1 as benchmark
// arguments. Each Args() row is
// {N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout}: batch, input height/width,
// kernel height/width, padding, stride, dilation, groups, and per-group
// input/output channels. Commented-out rows duplicate a shape already
// registered by an earlier fire module and are skipped to avoid benchmarking
// it twice.
static void SqueezeNetV11(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /************************** Conv 1 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 64});
  /************************** Fire 2 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 64, 16});
  b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
  b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
  /************************** Fire 3 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 128, 16});
//b->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64});
//b->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64});
  /************************** Fire 4 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 128, 32});
  b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 32, 128});
  b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 32, 128});
  /************************** Fire 5 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 256, 32});
//b->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 32, 128});
//b->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 32, 128});
  /************************** Fire 6 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 256, 48});
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 48, 192});
  b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 48, 192});
  /************************** Fire 7 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 384, 48});
//b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 48, 192});
//b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 48, 192});
  /************************** Fire 8 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 384, 64});
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 64, 256});
  b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 64, 256});
  /************************** Fire 9 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 64});
//b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 64, 256});
//b->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 64, 256});
  /************************* Conv 10 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 1000});
}
1630
// Registers the distinct convolution shapes of Inception v3 as benchmark
// arguments. Each Args() row is
// {N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout}: batch, input height/width,
// kernel height/width, padding, stride, dilation, groups, and per-group
// input/output channels. Rows are grouped by spatial resolution (299 down to
// 8, then the 1x1 classifier); the asymmetric 1x7/7x1 and 1x3/3x1 rows are
// the factorized convolutions of the 17x17 and 8x8 inception modules.
static void InceptionV3(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin  GCout */
  b->Args({1, 299, 299, 3, 3, 0, 0, 2, 1, 1, 3, 32});
  b->Args({1, 149, 149, 3, 3, 0, 0, 1, 1, 1, 32, 32});
  b->Args({1, 147, 147, 3, 3, 2, 2, 1, 1, 1, 32, 64});
  b->Args({1, 73, 73, 1, 1, 0, 0, 1, 1, 1, 64, 80});
  b->Args({1, 73, 73, 3, 3, 0, 0, 1, 1, 1, 80, 192});
  b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 64});
  b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 48});
  b->Args({1, 35, 35, 5, 5, 4, 4, 1, 1, 1, 48, 64});
  b->Args({1, 35, 35, 3, 3, 2, 2, 1, 1, 1, 64, 96});
  b->Args({1, 35, 35, 3, 3, 2, 2, 1, 1, 1, 96, 96});
  b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 32});
  b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 256, 64});
  b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 256, 48});
  b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 288, 64});
  b->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 288, 48});
  b->Args({1, 35, 35, 3, 3, 0, 0, 2, 1, 1, 288, 384});
  b->Args({1, 35, 35, 3, 3, 0, 0, 2, 1, 1, 96, 96});
  b->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 192});
  b->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 128});
  b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 128, 128});
  b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 128, 192});
  b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 128, 128});
  b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 128, 192});
  b->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 160});
  b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 160, 160});
  b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 160, 192});
  b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 160, 160});
  b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 160, 192});
  b->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 192, 192});
  b->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 192, 192});
  b->Args({1, 17, 17, 3, 3, 0, 0, 2, 1, 1, 192, 320});
  b->Args({1, 17, 17, 3, 3, 0, 0, 2, 1, 1, 192, 192});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 320});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 384});
  b->Args({1, 8, 8, 1, 3, 0, 2, 1, 1, 1, 384, 384});
  b->Args({1, 8, 8, 3, 1, 2, 0, 1, 1, 1, 384, 384});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 448});
  b->Args({1, 8, 8, 3, 3, 2, 2, 1, 1, 1, 448, 384});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 192});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 320});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 384});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 448});
  b->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 192});
  b->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 2048, 1001});
}
1680
1681static void ResNet18(benchmark::internal::Benchmark* b) {
1682 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1683
1684 /************************* Conv 1 *************************/
1685 /* N H W KH KW PH PW S D G GCin GCout */
1686 b->Args({1, 224, 224, 7, 7, 6, 6, 2, 1, 1, 3, 64});
1687 /************************ Conv 2.X ************************/
1688 /* N H W KH KW PH PW S D G GCin GCout */
1689 b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64});
1690 /************************ Conv 3.X ************************/
1691 /* N H W KH KW PH PW S D G GCin GCout */
1692 b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 1, 64, 128});
1693 b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 128, 128});
1694 b->Args({1, 56, 56, 1, 1, 0, 0, 2, 1, 1, 64, 128});
1695 /************************ Conv 4.X ************************/
1696 /* N H W KH KW PH PW S D G GCin GCout */
1697 b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 1, 128, 256});
1698 b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 256, 256});
1699 b->Args({1, 28, 28, 1, 1, 0, 0, 2, 1, 1, 128, 256});
1700 /************************ Conv 5.X ************************/
1701 /* N H W KH KW PH PW S D G GCin GCout */
1702 b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 1, 256, 512});
1703 b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 1, 512, 512});
1704 b->Args({1, 14, 14, 1, 1, 0, 0, 2, 1, 1, 256, 512});
1705}
1706
// Registers the distinct convolution shapes of ResNet-50 as benchmark
// arguments. Each Args() row is
// {N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout}: batch, input height/width,
// kernel height/width, padding, stride, dilation, groups, and per-group
// input/output channels. Each stage lists its first bottleneck (including the
// strided 1x1 projection) followed by the shapes unique to the remaining
// bottlenecks; commented-out rows duplicate a shape already registered above
// and are skipped to avoid benchmarking it twice.
static void ResNet50(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /************************* Conv 1 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 224, 224, 7, 7, 6, 6, 2, 1, 1, 3, 64});
  /************************ Conv 2.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 64});
  b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64});
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256});
//b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256});
  /************************ Conv 2.X ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 64});
//b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64});
//b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256});
  /************************ Conv 3.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 128});
  b->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 1, 128, 128});
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 512});
  b->Args({1, 56, 56, 1, 1, 0, 0, 2, 1, 1, 256, 512});
  /************************ Conv 3.X ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 128});
  b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 128, 128});
//b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 512});
  /************************ Conv 4.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 256});
  b->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 1, 256, 256});
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 1024});
  b->Args({1, 28, 28, 1, 1, 0, 0, 2, 1, 1, 512, 1024});
  /************************ Conv 4.X ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 1024, 256});
  b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 256, 256});
//b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 1024});
  /************************ Conv 5.1 ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 1024, 512});
  b->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 1, 512, 512});
  b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 2048});
  b->Args({1, 14, 14, 1, 1, 0, 0, 2, 1, 1, 1024, 2048});
  /************************ Conv 5.X ************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G  GCin GCout */
  b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 2048, 512});
  b->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 1, 512, 512});
//b->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 2048});
}
1758
1759static void VGG(benchmark::internal::Benchmark* b) {
1760 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1761
1762 /************************* Conv 1.1 ************************/
1763 /* N H W KH KW PH PW S D G GCin GCout */
1764 b->Args({1, 224, 224, 3, 3, 2, 2, 1, 1, 1, 3, 64});
1765 /************************* Conv 1.2 ************************/
1766 /* N H W KH KW PH PW S D G GCin GCout */
1767 b->Args({1, 224, 224, 3, 3, 2, 2, 1, 1, 1, 64, 64});
1768
1769 /************************* Conv 2.1 ************************/
1770 /* N H W KH KW PH PW S D G GCin GCout */
1771 b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 1, 64, 128});
1772 /************************* Conv 2.2 ************************/
1773 /* N H W KH KW PH PW S D G GCin GCout */
1774 b->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 1, 128, 128});
1775
1776 /************************* Conv 3.1 ************************/
1777 /* N H W KH KW PH PW S D G GCin GCout */
1778 b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 128, 256});
1779 /************************* Conv 3.2 ************************/
1780 /* N H W KH KW PH PW S D G GCin GCout */
1781 b->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 256, 256});
1782 /************************* Conv 3.3 ************************/
1783 /* N H W KH KW PH PW S D G GCin GCout */
1784 b->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 256});
1785
1786 /************************* Conv 4.1 ************************/
1787 /* N H W KH KW PH PW S D G GCin GCout */
1788 b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 256, 512});
1789 /************************* Conv 4.2 ************************/
1790 /* N H W KH KW PH PW S D G GCin GCout */
1791 b->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 512, 512});
1792 /************************* Conv 4.3 ************************/
1793 /* N H W KH KW PH PW S D G GCin GCout */
1794 b->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 512});
1795
1796 /************************* Conv 5.X ************************/
1797 /* N H W KH KW PH PW S D G GCin GCout */
1798 b->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 512, 512});
1799 /************************* Conv 5.3 ************************/
1800 /* N H W KH KW PH PW S D G GCin GCout */
1801 b->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 512, 512});
1802}
1803
1804// SRCNN (9-1-5)
1805static void SRCNN915(benchmark::internal::Benchmark* b) {
1806 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1807
1808 /* N H W KH KW PH PW S D G GCin GCout */
1809 b->Args({1, 384, 384, 9, 9, 0, 0, 1, 1, 1, 1, 64});
1810 b->Args({1, 376, 376, 1, 1, 0, 0, 1, 1, 1, 64, 32});
1811 b->Args({1, 376, 376, 5, 5, 0, 0, 1, 1, 1, 32, 1});
1812}
1813
1814// SRCNN (9-3-5)
1815static void SRCNN935(benchmark::internal::Benchmark* b) {
1816 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1817
1818 /* N H W KH KW PH PW S D G GCin GCout */
1819 b->Args({1, 384, 384, 9, 9, 0, 0, 1, 1, 1, 1, 64});
1820 b->Args({1, 376, 376, 3, 3, 0, 0, 1, 1, 1, 64, 32});
1821 b->Args({1, 374, 374, 5, 5, 0, 0, 1, 1, 1, 32, 1});
1822}
1823
1824// SRCNN (9-5-5)
1825static void SRCNN955(benchmark::internal::Benchmark* b) {
1826 b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});
1827
1828 /* N H W KH KW PH PW S D G GCin GCout */
1829 b->Args({1, 384, 384, 9, 9, 0, 0, 1, 1, 1, 1, 64});
1830 b->Args({1, 376, 376, 5, 5, 0, 0, 1, 1, 1, 64, 32});
1831 b->Args({1, 372, 372, 5, 5, 0, 0, 1, 1, 1, 32, 1});
1832}
1833
// Register every network's shape list against the XNNPACK half-precision
// (F16) convolution benchmark. UseRealTime() makes google-benchmark report
// wall-clock time rather than per-process CPU time.
BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, vgg, "VGG")->Apply(VGG)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();

// Same networks against the single-precision (F32) convolution benchmark.
BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();

// Same networks against the quantized 8-bit (Q8) convolution benchmark.
BENCHMARK_CAPTURE(xnnpack_convolution_q8, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, vgg, "VGG")->Apply(VGG)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
BENCHMARK_CAPTURE(xnnpack_convolution_q8, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
1902
// Baseline comparison: the same per-model convolution suites run through
// TensorFlow Lite's F32 convolution kernels. Only compiled in when the build
// defines BENCHMARK_TENSORFLOW_LITE (TFLite headers are included under the
// same guard at the top of this file).
#ifdef BENCHMARK_TENSORFLOW_LITE
  BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
  BENCHMARK_CAPTURE(tflite_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
#endif  // BENCHMARK_TENSORFLOW_LITE
1927
// Baseline comparison: the same per-model convolution suites run through the
// Arm Compute Library's F32 NEON convolution. Only compiled in when the build
// defines BENCHMARK_ARM_COMPUTE_LIBRARY (ACL headers are included under the
// same guard at the top of this file). NOTE(review): this list omits the
// MobileNet v3 suites present in the TFLite section above — presumably
// intentional (unsupported there); confirm before adding them.
#ifdef BENCHMARK_ARM_COMPUTE_LIBRARY
  BENCHMARK_CAPTURE(armcl_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();
  BENCHMARK_CAPTURE(armcl_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();
#endif  // BENCHMARK_ARM_COMPUTE_LIBRARY
1950
// Emit google-benchmark's main() unless the build defines
// XNNPACK_BENCHMARK_NO_MAIN — presumably so this translation unit can be
// linked into a harness that supplies its own entry point.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif