Blame - bench/convolution.cc - platform/external/XNNPACK

2019-09-27 18:10:33 -0700

[diff] [blame]

1

// Copyright (c) Facebook, Inc. and its affiliates.

//

//

// This source code is licensed under the BSD-style license found in the

7

// LICENSE file in the root directory of this source tree.

#include <algorithm>

#include <cfloat>

#include <cmath>

#include <functional>

Marat Dukhan

5ce30d9

2020-04-14 03:31:26 -0700

[diff] [blame]

13

#include <limits>

XNNPACK Team

2019-09-27 18:10:33 -0700

[diff] [blame]

#include <ostream>

#include <random>

#include <string>

#include <vector>

XNNPACK Team

2019-09-27 18:10:33 -0700

[diff] [blame]

19

#include <xnnpack.h>

20

Frank Barchard

bb4c18b

2019-09-30 11:05:52 -0700

[diff] [blame]

21

#ifdef BENCHMARK_ARM_COMPUTE_LIBRARY

22

#include "arm_compute/core/Types.h"

23

#include "arm_compute/runtime/Tensor.h"

24

#include "arm_compute/runtime/CPP/CPPScheduler.h"

25

#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"

26

#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"

27

#endif // BENCHMARK_ARM_COMPUTE_LIBRARY

XNNPACK Team

2019-09-27 18:10:33 -0700

[diff] [blame]

28

#include <benchmark/benchmark.h>

Frank Barchard

2020-06-26 14:07:19 -0700

[diff] [blame]

29

#include <fp16.h>

XNNPACK Team

2019-09-27 18:10:33 -0700

[diff] [blame]

30

#ifdef BENCHMARK_TENSORFLOW_LITE

31

#include "flatbuffers/include/flatbuffers/flatbuffers.h"

32

#include "tensorflow/lite/interpreter.h"

33

#include "tensorflow/lite/kernels/register.h"

34

#include "tensorflow/lite/model.h"

XNNPACK Team

2019-09-27 18:10:33 -0700

[diff] [blame]

35

#include "tensorflow/lite/schema/schema_generated.h"

36

#include "tensorflow/lite/version.h"

37

#endif // BENCHMARK_TENSORFLOW_LITE

Frank Barchard

bb4c18b

2019-09-30 11:05:52 -0700

[diff] [blame]

38

#include "bench/utils.h"

XNNPACK Team

2019-09-27 18:10:33 -0700

[diff] [blame]

39

Chao Mei

2020-07-23 09:35:11 -0700

[diff] [blame]

40

#ifndef XNN_NO_QU8_OPERATORS

Marat Dukhan

08b7a97

2020-07-14 18:17:29 -0700

[diff] [blame]

41

// Benchmarks the XNNPACK QU8 (uint8 input/kernel/output, int32 bias) NHWC 2D convolution operator.
//
// The problem shape is read from state.range(0..11), in order: batch size, input height, input width,
// kernel height, kernel width, total padding height, total padding width, subsampling, dilation,
// groups, input channels per group, output channels per group.
// `net` is accepted for registration purposes but is not read in this function.
//
// On any failure the benchmark is skipped with an error message and all operators created so far
// are released before returning.
void xnnpack_convolution_qu8(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t subsampling = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

  const size_t output_pixel_stride = groups * group_output_channels;
  const size_t input_pixel_stride = groups * group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  // Total padding is split in two; when it is odd the extra element goes to the right/bottom.
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t padding_right = padding_width - padding_left;
  const size_t padding_bottom = padding_height - padding_top;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

  std::vector<uint8_t> input(batch_size * input_height * input_width * input_pixel_stride);
  std::generate(input.begin(), input.end(), std::ref(u8rng));
  std::vector<uint8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
  std::vector<int32_t> bias(groups * group_output_channels);
  std::generate(bias.begin(), bias.end(), std::ref(i32rng));
  const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // One operator/output buffer, plus as many as it takes for kernel + bias + output bytes to cover
  // the value reported by GetMaxCacheSize(). The timing loop below cycles through them.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(uint8_t) * output_elements);
  std::vector<uint8_t> output(output_elements * num_buffers);

  // Elements are value-initialized to nullptr, so unfilled slots can be told apart from created operators.
  std::vector<xnn_operator_t> convolution_operators(num_buffers);

  // Best-effort cleanup for early-exit paths: deletes every operator created so far.
  // The delete status is intentionally ignored here because the benchmark has already been marked as failed.
  const auto release_operators = [&convolution_operators]() {
    for (xnn_operator_t& op : convolution_operators) {
      if (op != nullptr) {
        xnn_delete_operator(op);
        op = nullptr;
      }
    }
  };

  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_create_convolution2d_nhwc_qu8(
      padding_top, padding_right, padding_bottom, padding_left,
      kernel_height, kernel_width,
      subsampling, subsampling,
      dilation, dilation,
      groups, group_input_channels, group_output_channels,
      input_pixel_stride, output_pixel_stride,
      // NOTE(review): the constants below are fixed quantization parameters (presumably
      // zero point / scale pairs and the output clamping range) -- confirm against xnnpack.h.
      127, 0.5f,
      127, 0.5f,
      kernel.data(), bias.data(),
      127, 0.5f, 0, 255,
      0 /* flags */, &convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to create QUINT8 Convolution operator");
      release_operators();
      return;
    }
  }

  for (size_t i = 0; i < convolution_operators.size(); i++) {
    // Every operator reads the same input but writes to its own slice of `output`.
    status = xnn_setup_convolution2d_nhwc_qu8(
      convolution_operators[i],
      batch_size, input_height, input_width,
      input.data(), output.data() + i * output_elements,
      nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to setup QUINT8 Convolution operator");
      release_operators();
      return;
    }
  }

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Input prefetch and buffer rotation are excluded from the measured time.
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    status = xnn_run_operator(convolution_operators[buffer_index],
      nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run QUINT8 Convolution operator");
      release_operators();
      return;
    }
  }

  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_delete_operator(convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to delete QUINT8 Convolution operator");
      return;
    }
    convolution_op = nullptr;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 operations (multiply + add) per kernel tap, per input channel, per output element, reported as a rate.
  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * output_height * output_width *
      groups * group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}

158

#endif // XNN_NO_QU8_OPERATORS

159

160

#ifndef XNN_NO_QS8_OPERATORS

161

// Benchmarks the XNNPACK QS8 (int8 input/kernel/output, int32 bias) NHWC 2D convolution operator.
//
// The problem shape is read from state.range(0..11), in order: batch size, input height, input width,
// kernel height, kernel width, total padding height, total padding width, subsampling, dilation,
// groups, input channels per group, output channels per group.
// `net` is accepted for registration purposes but is not read in this function.
//
// On any failure the benchmark is skipped with an error message and all operators created so far
// are released before returning.
void xnnpack_convolution_qs8(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t subsampling = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto i8rng = std::bind(
    std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), std::ref(rng));

  const size_t output_pixel_stride = groups * group_output_channels;
  const size_t input_pixel_stride = groups * group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  // Total padding is split in two; when it is odd the extra element goes to the right/bottom.
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t padding_right = padding_width - padding_left;
  const size_t padding_bottom = padding_height - padding_top;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

  std::vector<int8_t> input(batch_size * input_height * input_width * input_pixel_stride);
  std::generate(input.begin(), input.end(), std::ref(i8rng));
  std::vector<int8_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(i8rng));
  std::vector<int32_t> bias(groups * group_output_channels);
  std::generate(bias.begin(), bias.end(), std::ref(i32rng));
  const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // One operator/output buffer, plus as many as it takes for kernel + bias + output bytes to cover
  // the value reported by GetMaxCacheSize(). The timing loop below cycles through them.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(int8_t) * kernel.size() + sizeof(int32_t) * bias.size() + sizeof(int8_t) * output_elements);
  std::vector<int8_t> output(output_elements * num_buffers);

  // Elements are value-initialized to nullptr, so unfilled slots can be told apart from created operators.
  std::vector<xnn_operator_t> convolution_operators(num_buffers);

  // Best-effort cleanup for early-exit paths: deletes every operator created so far.
  // The delete status is intentionally ignored here because the benchmark has already been marked as failed.
  const auto release_operators = [&convolution_operators]() {
    for (xnn_operator_t& op : convolution_operators) {
      if (op != nullptr) {
        xnn_delete_operator(op);
        op = nullptr;
      }
    }
  };

  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_create_convolution2d_nhwc_qs8(
      padding_top, padding_right, padding_bottom, padding_left,
      kernel_height, kernel_width,
      subsampling, subsampling,
      dilation, dilation,
      groups, group_input_channels, group_output_channels,
      input_pixel_stride, output_pixel_stride,
      // NOTE(review): the constants below are fixed quantization parameters (presumably
      // zero points / scales and the output clamping range) -- confirm against xnnpack.h.
      127, 0.5f, 0.5f,
      kernel.data(), bias.data(),
      127, 0.5f, -128, 127,
      0 /* flags */, &convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to create QINT8 Convolution operator");
      release_operators();
      return;
    }
  }

  for (size_t i = 0; i < convolution_operators.size(); i++) {
    // Every operator reads the same input but writes to its own slice of `output`.
    status = xnn_setup_convolution2d_nhwc_qs8(
      convolution_operators[i],
      batch_size, input_height, input_width,
      input.data(), output.data() + i * output_elements,
      nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to setup QINT8 Convolution operator");
      release_operators();
      return;
    }
  }

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Input prefetch and buffer rotation are excluded from the measured time.
    state.PauseTiming();
    // Size is expressed in the input's own element type (int8_t).
    benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(int8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    status = xnn_run_operator(convolution_operators[buffer_index],
      nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run QINT8 Convolution operator");
      release_operators();
      return;
    }
  }

  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_delete_operator(convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to delete QINT8 Convolution operator");
      return;
    }
    convolution_op = nullptr;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 operations (multiply + add) per kernel tap, per input channel, per output element, reported as a rate.
  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * output_height * output_width *
      groups * group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}

Marat Dukhan

2020-08-04 16:38:22 -0700

[diff] [blame]

278

#endif // XNN_NO_QS8_OPERATORS

XNNPACK Team

2019-09-27 18:10:33 -0700

[diff] [blame]

279

Chao Mei

2020-07-23 09:35:11 -0700

[diff] [blame]

280

#ifndef XNN_NO_F16_OPERATORS

Frank Barchard

2020-06-26 14:07:19 -0700

[diff] [blame]

281

// Benchmarks the XNNPACK F16 NHWC 2D convolution operator. Half-precision values are stored as
// uint16_t and produced by converting random floats with fp16_ieee_from_fp32_value.
//
// Returns immediately if benchmark::utils::CheckNEONFP16ARITH(state) reports false.
//
// The problem shape is read from state.range(0..11), in order: batch size, input height, input width,
// kernel height, kernel width, total padding height, total padding width, subsampling, dilation,
// groups, input channels per group, output channels per group.
// `net` is accepted for registration purposes but is not read in this function.
//
// On any failure the benchmark is skipped with an error message and all operators created so far
// are released before returning.
void xnnpack_convolution_f16(benchmark::State& state, const char* net) {
  if (!benchmark::utils::CheckNEONFP16ARITH(state)) {
    return;
  }
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t subsampling = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  const size_t output_pixel_stride = groups * group_output_channels;
  const size_t input_pixel_stride = groups * group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  // Total padding is split in two; when it is odd the extra element goes to the right/bottom.
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t padding_right = padding_width - padding_left;
  const size_t padding_bottom = padding_height - padding_top;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

  // The input buffer is over-allocated by XNN_EXTRA_BYTES worth of elements.
  std::vector<uint16_t> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::generate(input.begin(), input.end(), std::ref(f16rng));
  std::vector<uint16_t> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
  std::vector<uint16_t> bias(groups * group_output_channels);
  std::generate(bias.begin(), bias.end(), std::ref(f16rng));
  const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // One operator/output buffer, plus as many as it takes for kernel + bias + output bytes to cover
  // the value reported by GetMaxCacheSize(). The timing loop below cycles through them.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (kernel.size() + bias.size() + output_elements));
  std::vector<uint16_t> output(output_elements * num_buffers);

  // Elements are value-initialized to nullptr, so unfilled slots can be told apart from created operators.
  std::vector<xnn_operator_t> convolution_operators(num_buffers);

  // Best-effort cleanup for early-exit paths: deletes every operator created so far.
  // The delete status is intentionally ignored here because the benchmark has already been marked as failed.
  const auto release_operators = [&convolution_operators]() {
    for (xnn_operator_t& op : convolution_operators) {
      if (op != nullptr) {
        xnn_delete_operator(op);
        op = nullptr;
      }
    }
  };

  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_create_convolution2d_nhwc_f16(
      padding_top, padding_right, padding_bottom, padding_left,
      kernel_height, kernel_width,
      subsampling, subsampling,
      dilation, dilation,
      groups, group_input_channels, group_output_channels,
      input_pixel_stride, output_pixel_stride,
      kernel.data(), bias.data(),
      // Infinite bounds: the output range is left unrestricted.
      -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
      0 /* flags */, &convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to create FP16 Convolution operator");
      release_operators();
      return;
    }
  }

  for (size_t i = 0; i < convolution_operators.size(); i++) {
    // Every operator reads the same input but writes to its own slice of `output`.
    status = xnn_setup_convolution2d_nhwc_f16(
      convolution_operators[i],
      batch_size, input_height, input_width,
      input.data(), output.data() + i * output_elements,
      nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to setup FP16 Convolution operator");
      release_operators();
      return;
    }
  }

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Input prefetch and buffer rotation are excluded from the measured time.
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(uint16_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run FP16 Convolution operator");
      release_operators();
      return;
    }
  }

  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_delete_operator(convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to delete FP16 Convolution operator");
      return;
    }
    convolution_op = nullptr;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 operations (multiply + add) per kernel tap, per input channel, per output element, reported as a rate.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * output_height * output_width *
      groups * group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}

Chao Mei

2020-07-23 09:35:11 -0700

[diff] [blame]

398

#endif // XNN_NO_F16_OPERATORS

Frank Barchard

2020-06-26 14:07:19 -0700

[diff] [blame]

399

XNNPACK Team

2019-09-27 18:10:33 -0700

[diff] [blame]

400

// Benchmarks the XNNPACK F32 NHWC 2D convolution operator with random inputs drawn from [0, 1).
//
// The problem shape is read from state.range(0..11), in order: batch size, input height, input width,
// kernel height, kernel width, total padding height, total padding width, subsampling, dilation,
// groups, input channels per group, output channels per group.
// `net` is accepted for registration purposes but is not read in this function.
//
// On any failure the benchmark is skipped with an error message and all operators created so far
// are released before returning.
void xnnpack_convolution_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t subsampling = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  const size_t output_pixel_stride = groups * group_output_channels;
  const size_t input_pixel_stride = groups * group_input_channels;
  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  // Total padding is split in two; when it is odd the extra element goes to the right/bottom.
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t padding_right = padding_width - padding_left;
  const size_t padding_bottom = padding_height - padding_top;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

  // The input buffer is over-allocated by XNN_EXTRA_BYTES worth of elements.
  std::vector<float> input(batch_size * input_height * input_width * input_pixel_stride + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);
  std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
  std::vector<float> bias(groups * group_output_channels);
  std::generate(bias.begin(), bias.end(), std::ref(f32rng));
  const size_t output_elements = batch_size * output_height * output_width * output_pixel_stride;

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  // One operator/output buffer, plus as many as it takes for kernel + bias + output bytes to cover
  // the value reported by GetMaxCacheSize(). The timing loop below cycles through them.
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(float) * (kernel.size() + bias.size() + output_elements));
  std::vector<float> output(output_elements * num_buffers);

  // Elements are value-initialized to nullptr, so unfilled slots can be told apart from created operators.
  std::vector<xnn_operator_t> convolution_operators(num_buffers);

  // Best-effort cleanup for early-exit paths: deletes every operator created so far.
  // The delete status is intentionally ignored here because the benchmark has already been marked as failed.
  const auto release_operators = [&convolution_operators]() {
    for (xnn_operator_t& op : convolution_operators) {
      if (op != nullptr) {
        xnn_delete_operator(op);
        op = nullptr;
      }
    }
  };

  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_create_convolution2d_nhwc_f32(
      padding_top, padding_right, padding_bottom, padding_left,
      kernel_height, kernel_width,
      subsampling, subsampling,
      dilation, dilation,
      groups, group_input_channels, group_output_channels,
      input_pixel_stride, output_pixel_stride,
      kernel.data(), bias.data(),
      // Infinite bounds: the output range is left unrestricted.
      -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity(),
      0 /* flags */, &convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to create FP32 Convolution operator");
      release_operators();
      return;
    }
  }

  for (size_t i = 0; i < convolution_operators.size(); i++) {
    // Every operator reads the same input but writes to its own slice of `output`.
    status = xnn_setup_convolution2d_nhwc_f32(
      convolution_operators[i],
      batch_size, input_height, input_width,
      input.data(), output.data() + i * output_elements,
      nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to setup FP32 Convolution operator");
      release_operators();
      return;
    }
  }

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Input prefetch and buffer rotation are excluded from the measured time.
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(input.data(), input.size() * sizeof(float));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    status = xnn_run_operator(convolution_operators[buffer_index], nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run FP32 Convolution operator");
      release_operators();
      return;
    }
  }

  for (xnn_operator_t& convolution_op : convolution_operators) {
    status = xnn_delete_operator(convolution_op);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to delete FP32 Convolution operator");
      return;
    }
    convolution_op = nullptr;
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 operations (multiply + add) per kernel tap, per input channel, per output element, reported as a rate.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * output_height * output_width *
      groups * group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}

513

514

#ifdef BENCHMARK_TENSORFLOW_LITE

515

void tflite_convolution_f32(benchmark::State& state, const char* net) {

516

const size_t batch_size = state.range(0);

517

const size_t input_height = state.range(1);

518

const size_t input_width = state.range(2);

519

const size_t kernel_height = state.range(3);

520

const size_t kernel_width = state.range(4);

521

const size_t padding_height = state.range(5);

522

const size_t padding_width = state.range(6);

523

const size_t subsampling = state.range(7);

524

const size_t dilation = state.range(8);

525

const size_t groups = state.range(9);

526

const size_t group_input_channels = state.range(10);

527

const size_t group_output_channels = state.range(11);

528

529

bool is_depthwise = false;

530

if (groups != 1) {

531

if (group_input_channels == 1) {

532

is_depthwise = true;

533

} else {

534

state.SkipWithError("grouped convolution is not supported");

return;

}

}

std::random_device random_device;

540

auto rng = std::mt19937(random_device());

Marat Dukhan

2020-08-02 21:46:58 -0700

[diff] [blame]

541

auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

XNNPACK Team

2019-09-27 18:10:33 -0700

[diff] [blame]

542

543

const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;

544

const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;

545

546

tflite::Padding padding = tflite::Padding_VALID;

547

if (padding_width == (effective_kernel_width - 1) && padding_height == (effective_kernel_height - 1)) {

548

padding = tflite::Padding_SAME;

549

} else if (padding_width == 0 && padding_height == 0) {

550

padding = tflite::Padding_VALID;

551

} else {

552

state.SkipWithError("unsupported padding");

return;

}

const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;

557

const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

558

559

std::vector<float> kernel(groups * group_output_channels * kernel_height * kernel_width * group_input_channels);

560

std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));

561

std::vector<float> bias(groups * group_output_channels);

562

std::generate(bias.begin(), bias.end(), std::ref(f32rng));

563

564

flatbuffers::FlatBufferBuilder builder;

565

flatbuffers::Offset<tflite::OperatorCode> operator_code =

566

CreateOperatorCode(

567

builder,

568

is_depthwise ? tflite::BuiltinOperator_DEPTHWISE_CONV_2D : tflite::BuiltinOperator_CONV_2D,

569

0);

570

571

flatbuffers::Offset<tflite::Conv2DOptions> conv2d_options = CreateConv2DOptions(

572

builder,

573

padding,

574

static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),

575

tflite::ActivationFunctionType_NONE,

576

static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));

577

578

flatbuffers::Offset<tflite::DepthwiseConv2DOptions> dwconv2d_options = CreateDepthwiseConv2DOptions(

579

builder,

580

padding,

581

static_cast<int32_t>(subsampling), static_cast<int32_t>(subsampling),

582

static_cast<int32_t>(group_output_channels),

583

tflite::ActivationFunctionType_NONE,

584

static_cast<int32_t>(dilation), static_cast<int32_t>(dilation));

585

586

flatbuffers::Offset<tflite::Buffer> buffers[3] = {

587

tflite::CreateBuffer(builder, builder.CreateVector({})),

588

tflite::CreateBuffer(builder, builder.CreateVector(

589

reinterpret_cast<const uint8_t*>(kernel.data()),

590

sizeof(float) * kernel.size())),

591

tflite::CreateBuffer(builder, builder.CreateVector(

592

reinterpret_cast<const uint8_t*>(bias.data()),

593

sizeof(float) * bias.size())),

594

};

595

596

const int32_t input_shape[4] = {

597

static_cast<int32_t>(batch_size),

598

static_cast<int32_t>(input_height),

599

static_cast<int32_t>(input_width),

600

static_cast<int32_t>(groups * group_input_channels)

601

};

602

const int32_t output_shape[4] = {

603

static_cast<int32_t>(batch_size),

604

static_cast<int32_t>(output_height),

605

static_cast<int32_t>(output_width),

606

static_cast<int32_t>(groups * group_output_channels)

607

};

608

const int32_t filter_shape[4] = {

609

static_cast<int32_t>(group_output_channels),

610

static_cast<int32_t>(kernel_height),

611

static_cast<int32_t>(kernel_width),

612

static_cast<int32_t>(groups * group_input_channels)

613

};

614

const int32_t bias_shape[1] = {

615

static_cast<int32_t>(groups * group_output_channels)

616

};

617

618

flatbuffers::Offset<tflite::Tensor> tensors[4] = {

619

tflite::CreateTensor(builder,

620

builder.CreateVector<int32_t>(input_shape, 4),

621

tflite::TensorType_FLOAT32,

622

0 /* buffer id */,

623

builder.CreateString("input")),

624

tflite::CreateTensor(builder,

625

builder.CreateVector<int32_t>(filter_shape, 4),

626

tflite::TensorType_FLOAT32,

627

1 /* buffer id */,

628

builder.CreateString("filter")),

629

tflite::CreateTensor(builder,

630

builder.CreateVector<int32_t>(bias_shape, 1),

631

tflite::TensorType_FLOAT32,

632

2 /* buffer id */,

633

builder.CreateString("bias")),

634

tflite::CreateTensor(builder,

635

builder.CreateVector<int32_t>(output_shape, 4),

636

tflite::TensorType_FLOAT32,

637

0 /* buffer id */,

638

builder.CreateString("output")),

639

};

640

641

const int32_t op_inputs[3] = { 0, 1, 2 };

642

const int32_t op_outputs[1] = { 3 };

643

flatbuffers::Offset<tflite::Operator> op = CreateOperator(

644

builder,

645

0 /* opcode_index */,

646

builder.CreateVector<int32_t>(op_inputs, 3),

647

builder.CreateVector<int32_t>(op_outputs, 1),

648

is_depthwise ? tflite::BuiltinOptions_DepthwiseConv2DOptions : tflite::BuiltinOptions_Conv2DOptions,

649

is_depthwise ? dwconv2d_options.Union() : conv2d_options.Union(),

650

/*custom_options */ 0,

651

tflite::CustomOptionsFormat_FLEXBUFFERS);

652

653

const int32_t graph_inputs[1] = { 0 };

654

const int32_t graph_outputs[1] = { 3 };

655

flatbuffers::Offset<tflite::SubGraph> subgraph = CreateSubGraph(

656

builder,

657

builder.CreateVector(tensors, 4),

658

builder.CreateVector<int32_t>(graph_inputs, 1),

659

builder.CreateVector<int32_t>(graph_outputs, 1),

660

builder.CreateVector(&op, 1),

661

builder.CreateString("Conv2D subgraph"));

662

663

flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Conv2D model");

664

665

flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,

666

TFLITE_SCHEMA_VERSION,

667

builder.CreateVector(&operator_code, 1),

668

builder.CreateVector(&subgraph, 1),

669

description,

670

builder.CreateVector(buffers, 3));

671

672

builder.Finish(model_buffer);

673

674

const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());

Chao Mei

f9fdaa7

2021-05-18 23:04:34 -0700

[diff] [blame]

675

tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;

XNNPACK Team

2019-09-27 18:10:33 -0700

[diff] [blame]

676

tflite::InterpreterBuilder interpreterBuilder(model, resolver);

677

std::unique_ptr<tflite::Interpreter> interpreter;

678

if (interpreterBuilder(&interpreter) != kTfLiteOk) {

679

state.SkipWithError("failed to create TFLite interpreter");

680

return;

681

}

682

if (interpreter == nullptr) {

683

state.SkipWithError("TFLite interpreter is null");

684

return;

685

}

686

interpreter->SetNumThreads(1);

687

688

if (interpreter->AllocateTensors() != kTfLiteOk) {

689

state.SkipWithError("failed to allocate tensors");

return;

}

std::generate(

interpreter->typed_tensor<float>(0),

695

interpreter->typed_tensor<float>(0) + batch_size * groups * group_input_channels * input_height * input_width,

696

std::ref(f32rng));

697

698

for (auto _ : state) {

699

state.PauseTiming();

Marat Dukhan

2019-10-23 02:09:02 -0700

[diff] [blame]

700

benchmark::utils::WipeCache();

701

benchmark::utils::PrefetchToL1(

XNNPACK Team

2019-09-27 18:10:33 -0700

[diff] [blame]

702

interpreter->typed_tensor<float>(0),

703

batch_size * groups * group_input_channels * input_height * input_width * sizeof(float));

704

state.ResumeTiming();

705

706

if (interpreter->Invoke() != kTfLiteOk) {

707

state.SkipWithError("failed to invoke TFLite interpreter");

return;

}

}

Marat Dukhan

2020-12-04 14:23:12 -0800

[diff] [blame]

712

const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();

713

if (cpu_frequency != 0) {

714

state.counters["cpufreq"] = cpu_frequency;

715

}

716

XNNPACK Team

2019-09-27 18:10:33 -0700

[diff] [blame]

717

state.counters["FLOPS"] = benchmark::Counter(

718

uint64_t(state.iterations()) * 2 *

719

batch_size * output_height * output_width *

720

groups * group_input_channels * group_output_channels *

721

kernel_height * kernel_width,

722

benchmark::Counter::kIsRate);

interpreter.reset();

}

#endif // BENCHMARK_TENSORFLOW_LITE

727

728

#ifdef BENCHMARK_ARM_COMPUTE_LIBRARY

729

static std::string compare_with_convolution_f32_reference_output(

730

const benchmark::State& state, const float* input, size_t input_size,

731

const float* kernel, size_t kernel_size, const float* bias, size_t bias_size,

732

const float* output, size_t output_size)

733

{

734

const size_t batch_size = state.range(0);

735

const size_t input_height = state.range(1);

736

const size_t input_width = state.range(2);

737

const size_t kernel_height = state.range(3);

738

const size_t kernel_width = state.range(4);

739

const size_t padding_height = state.range(5);

740

const size_t padding_width = state.range(6);

741

const size_t subsampling = state.range(7);

742

const size_t dilation = state.range(8);

743

const size_t groups = state.range(9);

744

const size_t group_input_channels = state.range(10);

745

const size_t group_output_channels = state.range(11);

746

747

const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;

748

const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;

749

const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;

750

const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

751

const size_t input_pixel_stride = groups * group_input_channels;

752

const size_t padding_left = padding_width / 2;

753

const size_t padding_top = padding_height / 2;

754

755

assert(input_size == batch_size * input_height * input_width * groups * group_input_channels);

756

757

assert(kernel_size == group_output_channels * kernel_height * kernel_width * groups * group_input_channels);

758

759

assert(bias_size == groups * group_output_channels);

760

761

assert(output_size == batch_size * output_height * output_width * groups * group_output_channels);

762

763

std::vector<float> output_ref(output_size);

764

for (size_t i = 0; i < batch_size; i++) {

765

for (size_t oy = 0; oy < output_height; oy++) {

766

for (size_t ox = 0; ox < output_width; ox++) {

767

for (size_t g = 0; g < groups; g++) {

768

for (size_t oc = 0; oc < group_output_channels; oc++) {

769

output_ref[(((i * output_height + oy) * output_width + ox) * groups + g) * group_output_channels + oc] =

770

bias[g * group_output_channels + oc];

}

}

}

}

}

for (size_t i = 0; i < batch_size; i++) {

777

for (size_t oy = 0; oy < output_height; oy++) {

778

for (size_t ox = 0; ox < output_width; ox++) {

779

for (size_t ky = 0; ky < kernel_height; ky++) {

780

const size_t iy = oy * subsampling + ky * dilation - padding_top;

781

if (iy < input_height) {

782

for (size_t kx = 0; kx < kernel_width; kx++) {

783

const size_t ix = ox * subsampling + kx * dilation - padding_left;

784

if (ix < input_width) {

785

for (size_t g = 0; g < groups; g++) {

786

for (size_t oc = 0; oc < group_output_channels; oc++) {

787

for (size_t ic = 0; ic < group_input_channels; ic++) {

788

output_ref[(((i * output_height + oy) * output_width + ox) * groups + g) * group_output_channels + oc] +=

789

input[((i * input_height + iy) * input_width + ix) * input_pixel_stride + g * group_input_channels + ic] *

790

kernel[(((oc * kernel_height + ky) * kernel_width + kx) * groups + g) * group_input_channels + ic];

791

} // group_input_channels loop

792

} // group_output_channels loop

793

} // groups loop

794

}

795

} // kernel_width loop

796

}

797

} // kernel_height loop

798

} // output_width loop

799

} // output_height loop

800

} // batch_size loop

801

802

const float relative_error_tolerance = 1e-4;

803

for (size_t i = 0; i < batch_size; i++) {

804

for (size_t y = 0; y < output_height; y++) {

805

for (size_t x = 0; x < output_width; x++) {

806

for (size_t g = 0; g < groups; g++) {

807

for (size_t c = 0; c < group_output_channels; c++) {

808

const size_t idx = (((i * output_height + y) * output_width + x) * groups + g) * group_output_channels + c;

809

const float value_ref = output_ref[idx];

810

const float value = output[idx];

811

if (std::abs(value - value_ref) > std::max(std::abs(value_ref) * relative_error_tolerance, std::numeric_limits<float>::epsilon())) {

812

std::ostringstream error_stream;

813

error_stream << "(x, y) = (" << x << ", " << y << "), group = " << g

814

<< ", channel = " << c << ", refValue = " << value_ref

815

<< ", actualValue = " << value

816

<< ", absDiff=" << std::abs(value - value_ref);

817

return error_stream.str();

}

}

}

}

}

}

return "";

}

// Benchmarks one f32 convolution layer through Arm Compute Library on a single
// scheduler thread and checks the result against a scalar reference.
//
// Geometry is read from state.range(0..11) in the order
// N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout. PH/PW are total paddings;
// the top/left share is PH/2 and PW/2, the remainder goes bottom/right.
// `net` is not used by this function.
//
// The benchmark is skipped with an error when:
//   - groups != 1 and the layer is not depthwise (GCin != 1),
//   - the layer is depthwise and dilated,
//   - ACL's validate() rejects the configuration,
//   - the computed output does not match the reference.
void armcl_convolution_f32(benchmark::State& state, const char* net) {
  const size_t batch_size = state.range(0);
  const size_t input_height = state.range(1);
  const size_t input_width = state.range(2);
  const size_t kernel_height = state.range(3);
  const size_t kernel_width = state.range(4);
  const size_t padding_height = state.range(5);
  const size_t padding_width = state.range(6);
  const size_t subsampling = state.range(7);
  const size_t dilation = state.range(8);
  const size_t groups = state.range(9);
  const size_t group_input_channels = state.range(10);
  const size_t group_output_channels = state.range(11);

  const size_t effective_kernel_height = (kernel_height - 1) * dilation + 1;
  const size_t effective_kernel_width = (kernel_width - 1) * dilation + 1;
  const size_t padding_left = padding_width / 2;
  const size_t padding_top = padding_height / 2;
  const size_t padding_right = padding_width - padding_left;
  const size_t padding_bottom = padding_height - padding_top;
  const size_t output_height = (input_height + padding_height - effective_kernel_height) / subsampling + 1;
  const size_t output_width = (input_width + padding_width - effective_kernel_width) / subsampling + 1;

  arm_compute::PadStrideInfo pad_stride_info(
    subsampling /* stride x (width) */,
    subsampling /* stride y (height) */,
    padding_left, padding_right, padding_top, padding_bottom,
    arm_compute::DimensionRoundingType::FLOOR);
  arm_compute::Size2D dilation_info(dilation, dilation);
  // Note: activation is disabled by default.
  arm_compute::ActivationLayerInfo activation_info;

  // Note: reverse order of dimensions, i.e. CWHN for NHWC.
  arm_compute::TensorShape input_shape(
    /* C */ groups * group_input_channels,
    /* W */ input_width,
    /* H */ input_height,
    /* N */ batch_size);
  arm_compute::TensorInfo input_info(
    input_shape,
    1 /* number of channels per element (!) */,
    arm_compute::DataType::F32);
  input_info.set_data_layout(arm_compute::DataLayout::NHWC);
  arm_compute::Tensor input_tensor;
  input_tensor.allocator()->init(input_info);
  input_tensor.allocator()->allocate();

  // Note: reverse order of dimensions, i.e. IWHO for OHWI.
  arm_compute::TensorShape kernel_shape(
    /* I */ groups * group_input_channels,
    /* W */ kernel_width,
    /* H */ kernel_height,
    /* O */ group_output_channels);
  arm_compute::TensorInfo kernel_info(
    kernel_shape,
    1 /* number of channels per element (!) */,
    arm_compute::DataType::F32);
  kernel_info.set_data_layout(arm_compute::DataLayout::NHWC);
  arm_compute::Tensor kernelTensor;
  kernelTensor.allocator()->init(kernel_info);
  kernelTensor.allocator()->allocate();

  arm_compute::TensorShape bias_shape(groups * group_output_channels);
  arm_compute::TensorInfo bias_info(
    bias_shape,
    1 /* number of channels per element (!) */,
    arm_compute::DataType::F32);
  bias_info.set_data_layout(arm_compute::DataLayout::NHWC);
  arm_compute::Tensor bias_tensor;
  bias_tensor.allocator()->init(bias_info);
  bias_tensor.allocator()->allocate();

  // Note: reverse order of dimensions, i.e. CWHN for NHWC.
  arm_compute::TensorShape output_shape(
    /* C */ groups * group_output_channels,
    /* W */ output_width,
    /* H */ output_height,
    /* N */ batch_size);
  arm_compute::TensorInfo output_info(
    output_shape,
    1 /* number of channels per element (!) */,
    arm_compute::DataType::F32);
  output_info.set_data_layout(arm_compute::DataLayout::NHWC);
  arm_compute::Tensor output_tensor;
  output_tensor.allocator()->init(output_info);
  output_tensor.allocator()->allocate();

  // Fill every tensor (including the output) with uniform random values in [0, 1).
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));

  std::generate(
    reinterpret_cast<float*>(input_tensor.buffer()),
    reinterpret_cast<float*>(input_tensor.buffer()) + input_shape.total_size(),
    std::ref(f32rng));
  std::generate(
    reinterpret_cast<float*>(kernelTensor.buffer()),
    reinterpret_cast<float*>(kernelTensor.buffer()) + kernel_shape.total_size(),
    std::ref(f32rng));
  std::generate(
    reinterpret_cast<float*>(bias_tensor.buffer()),
    reinterpret_cast<float*>(bias_tensor.buffer()) + bias_shape.total_size(),
    std::ref(f32rng));
  std::generate(
    reinterpret_cast<float*>(output_tensor.buffer()),
    reinterpret_cast<float*>(output_tensor.buffer()) + output_shape.total_size(),
    std::ref(f32rng));

  bool is_depthwise = false;
  if (groups != 1) {
    // NEConvolutionLayer uses NEGEMMConvolutionLayer by default, which doesn't support grouped convolution.
    // However, depthwise convolution is supported via NEDepthwiseConvolutionLayer.
    if (group_input_channels == 1) {
      is_depthwise = true;
    } else {
      state.SkipWithError("grouped convolution is not supported");
      return;
    }
  }

  // In every branch below validate() is consulted BEFORE configure(), so an
  // unsupported configuration is reported as a skipped benchmark instead of
  // being handed to configure() first.
  std::shared_ptr<arm_compute::IFunction> layer;
  if (is_depthwise) {
    if (dilation != 1) {
      state.SkipWithError("dilated depthwise convolution is not supported");
      return;
    }

    // Avoid NEDepthwiseConvolutionLayer3x3 when stride isn't 2 in order to pass the output verification.
    // TODO(b/130206370) This looks like a bug and needs further investigation.
    if (kernel_height == 3 && kernel_width == 3 && subsampling == 2) {
      auto* depthwise_3x3_convolution_layer = new arm_compute::NEDepthwiseConvolutionLayer3x3();
      layer.reset(depthwise_3x3_convolution_layer);

      if (!depthwise_3x3_convolution_layer->validate(
        &input_info, &kernel_info, &bias_info, &output_info,
        pad_stride_info, group_output_channels, activation_info))
      {
        state.SkipWithError("validation failed");
        return;
      }

      depthwise_3x3_convolution_layer->configure(
        &input_tensor, &kernelTensor, &bias_tensor, &output_tensor,
        pad_stride_info, group_output_channels, activation_info);
    } else {
      auto* depthwise_convolution_layer = new arm_compute::NEDepthwiseConvolutionLayer();
      layer.reset(depthwise_convolution_layer);

      if (!depthwise_convolution_layer->validate(
        &input_info, &kernel_info, &bias_info, &output_info,
        pad_stride_info, group_output_channels, activation_info))
      {
        state.SkipWithError("validation failed");
        return;
      }

      depthwise_convolution_layer->configure(
        &input_tensor, &kernelTensor, &bias_tensor, &output_tensor,
        pad_stride_info, group_output_channels, activation_info);
    }
  } else {
    auto* convolution_layer = new arm_compute::NEConvolutionLayer();
    layer.reset(convolution_layer);

    if (!convolution_layer->validate(
      &input_info, &kernel_info, &bias_info, &output_info,
      pad_stride_info, arm_compute::WeightsInfo(), dilation_info, activation_info,
      true /* enable fast math */, groups))
    {
      state.SkipWithError("validation failed");
      return;
    }

    convolution_layer->configure(
      &input_tensor, &kernelTensor, &bias_tensor, &output_tensor,
      pad_stride_info, arm_compute::WeightsInfo(), dilation_info, activation_info,
      true /* enable fast math */, groups);
  }

  // Dry run to let ACL do one-time initializations.
  arm_compute::CPPScheduler::get().set_num_threads(1);
  layer->run();

  for (auto _ : state) {
    // Cache wiping and input prefetching are excluded from the timed region.
    state.PauseTiming();
    benchmark::utils::WipeCache();
    benchmark::utils::PrefetchToL1(
      input_tensor.buffer(),
      batch_size * groups * group_input_channels * input_height * input_width * sizeof(float));
    state.ResumeTiming();

    layer->run();
  }

  // Validate outputs.
  const std::string error_string = compare_with_convolution_f32_reference_output(
    state, reinterpret_cast<const float*>(input_tensor.buffer()),
    input_shape.total_size(),
    reinterpret_cast<const float*>(kernelTensor.buffer()),
    kernel_shape.total_size(),
    reinterpret_cast<const float*>(bias_tensor.buffer()),
    bias_shape.total_size(),
    reinterpret_cast<const float*>(output_tensor.buffer()),
    output_shape.total_size());

  if (!error_string.empty()) {
    state.SkipWithError(("validation failed: " + error_string).c_str());
    return;
  }

  input_tensor.allocator()->free();
  kernelTensor.allocator()->free();
  bias_tensor.allocator()->free();
  output_tensor.allocator()->free();

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Two floating-point operations (multiply + add) per kernel tap per output element.
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 *
      batch_size * output_height * output_width *
      groups * group_input_channels * group_output_channels *
      kernel_height * kernel_width,
    benchmark::Counter::kIsRate);
}

1051

#endif // BENCHMARK_ARM_COMPUTE_LIBRARY

1052

1053

// ShuffleNet v1 with 1 group.

1054

static void ShuffleNetV1G1(benchmark::internal::Benchmark* b) {
  // One Args() row per convolution layer; columns follow ArgNames():
  // batch, input height/width, kernel height/width, total padding
  // height/width, subsampling, dilation, groups, and input/output
  // channels per group.
  // NOTE(review): the 3x3 depthwise rows in the "stride-1" sections also use
  // S = 2 -- confirm this is intended.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    // Conv 1
    ->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24})
    // Stage 2: stride-2 unit
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 36})
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 36, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 36, 120})
    // Stage 2: stride-1 units
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 36})
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 36, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 36, 144})
    // Stage 3: stride-2 unit
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 72})
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 72, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 72, 144})
    // Stage 3: stride-1 units
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 288, 72})
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 72, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 72, 288})
    // Stage 4: stride-2 unit
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 288, 144})
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 144, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 144, 288})
    // Stage 4: stride-1 units
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 144})
    ->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 144, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 144, 576});
}

1091

1092

// ShuffleNet v1 with 2 groups.

1093

static void ShuffleNetV1G2(benchmark::internal::Benchmark* b) {
  // One Args() row per convolution layer; columns follow ArgNames():
  // batch, input height/width, kernel height/width, total padding
  // height/width, subsampling, dilation, groups, and input/output
  // channels per group.
  // NOTE(review): the 3x3 depthwise rows in the "stride-1" sections also use
  // S = 2 -- confirm this is intended.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    // Conv 1
    ->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24})
    // Stage 2: stride-2 unit
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 50})
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 50, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 25, 88})
    // Stage 2: stride-1 units
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 100, 25})
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 50, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 25, 100})
    // Stage 3: stride-2 unit
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 2, 100, 50})
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 100, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 50, 100})
    // Stage 3: stride-1 units
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 200, 50})
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 100, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 50, 200})
    // Stage 4: stride-2 unit
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 2, 200, 100})
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 200, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 100, 200})
    // Stage 4: stride-1 units
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 400, 100})
    ->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 200, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 2, 100, 400});
}

1130

1131

// ShuffleNet v1 with 3 groups.

1132

static void ShuffleNetV1G3(benchmark::internal::Benchmark* b) {
  // One Args() row per convolution layer; columns follow ArgNames():
  // batch, input height/width, kernel height/width, total padding
  // height/width, subsampling, dilation, groups, and input/output
  // channels per group.
  // NOTE(review): the 3x3 depthwise rows in the "stride-1" sections also use
  // S = 2 -- confirm this is intended.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    // Conv 1
    ->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24})
    // Stage 2: stride-2 unit
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 60})
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 60, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 20, 72})
    // Stage 2: stride-1 units
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 80, 20})
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 60, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 20, 80})
    // Stage 3: stride-2 unit
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 3, 80, 40})
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 120, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 40, 80})
    // Stage 3: stride-1 units
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 160, 40})
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 120, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 40, 160})
    // Stage 4: stride-2 unit
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 3, 160, 80})
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 240, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 80, 160})
    // Stage 4: stride-1 units
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 320, 80})
    ->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 240, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 3, 80, 320});
}

1169

1170

// ShuffleNet v1 with 4 groups.

1171

static void ShuffleNetV1G4(benchmark::internal::Benchmark* b) {
  // One Args() row per convolution layer; columns follow ArgNames():
  // batch, input height/width, kernel height/width, total padding
  // height/width, subsampling, dilation, groups, and input/output
  // channels per group.
  // NOTE(review): the 3x3 depthwise rows in the "stride-1" sections also use
  // S = 2 -- confirm this is intended.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    // Conv 1
    ->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24})
    // Stage 2: stride-2 unit
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 68})
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 68, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 17, 62})
    // Stage 2: stride-1 units
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 68, 17})
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 68, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 17, 68})
    // Stage 3: stride-2 unit
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 4, 68, 34})
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 136, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 34, 68})
    // Stage 3: stride-1 units
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 136, 34})
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 136, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 34, 136})
    // Stage 4: stride-2 unit
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 4, 136, 68})
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 272, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 68, 136})
    // Stage 4: stride-1 units
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 272, 68})
    ->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 272, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 4, 68, 272});
}

1208

1209

// ShuffleNet v1 with 8 groups.

1210

static void ShuffleNetV1G8(benchmark::internal::Benchmark* b) {
  // One Args() row per convolution layer; columns follow ArgNames():
  // batch, input height/width, kernel height/width, total padding
  // height/width, subsampling, dilation, groups, and input/output
  // channels per group.
  // NOTE(review): the 3x3 depthwise rows in the "stride-1" sections also use
  // S = 2 -- confirm this is intended.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    // Conv 1
    ->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24})
    // Stage 2: stride-2 unit
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 96})
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 96, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 12, 45})
    // Stage 2: stride-1 units
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 48, 12})
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 96, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 12, 48})
    // Stage 3: stride-2 unit
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 8, 48, 24})
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 192, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 24, 48})
    // Stage 3: stride-1 units
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 96, 24})
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 192, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 24, 96})
    // Stage 4: stride-2 unit
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 8, 96, 48})
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 384, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 48, 96})
    // Stage 4: stride-1 units
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 192, 48})
    ->Args({1, 7, 7, 3, 3, 2, 2, 2, 1, 384, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 8, 48, 192});
}

1247

1248

// ShuffleNet v2 (0.5X scale)

1249

static void ShuffleNetV2X05(benchmark::internal::Benchmark* b) {
  // One Args() row per convolution layer; columns follow ArgNames():
  // batch, input height/width, kernel height/width, total padding
  // height/width, subsampling, dilation, groups, and input/output
  // channels per group.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    // Conv 1
    ->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24})
    // Stage 2
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 24})
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 24})
    ->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 24, 1, 1})
    // Stage 3
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 48, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 48})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 48, 48})
    ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 48, 1, 1})
    // Stage 4
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 96, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 96})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 96})
    ->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 96, 1, 1})
    // Conv 5
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 192, 1024});
}

1277

1278

// ShuffleNet v2 (1.0X scale)

1279

static void ShuffleNetV2X10(benchmark::internal::Benchmark* b) {
  // One Args() row per convolution layer; columns follow ArgNames():
  // batch, input height/width, kernel height/width, total padding
  // height/width, subsampling, dilation, groups, and input/output
  // channels per group.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    // Conv 1
    ->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24})
    // Stage 2
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 58})
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 58})
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 58, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 58, 58})
    ->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 58, 1, 1})
    // Stage 3
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 116, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 116, 116})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 116, 116})
    ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 116, 1, 1})
    // Stage 4
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 232, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 232, 232})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 232, 232})
    ->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 232, 1, 1})
    // Conv 5
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 464, 1024});
}

1309

1310

// ShuffleNet v2 (1.5X scale)

1311

static void ShuffleNetV2X15(benchmark::internal::Benchmark* b) {
  // Names the twelve benchmark arguments, then registers one argument tuple
  // per convolution shape of ShuffleNet v2 (1.5X scale). ArgNames() and
  // Args() each return the benchmark pointer, so the whole table is a single
  // chained expression; registration order is the order listed here.
  // Tuple column order: N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    // ---- Conv 1 ----
    ->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24})
    // ---- Stage 2 ----
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 88})
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 88})
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 88, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 88, 88})
    ->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 88, 1, 1})
    // ---- Stage 3 ----
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 176, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 176, 176})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 176, 176})
    ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 176, 1, 1})
    // ---- Stage 4 ----
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 352, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 352, 352})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 352, 352})
    ->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 352, 1, 1})
    // ---- Conv 5 ----
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 704, 1024});
}

1341

1342

// ShuffleNet v2 (2.0X scale)

1343

static void ShuffleNetV2X20(benchmark::internal::Benchmark* b) {
  // Names the twelve benchmark arguments, then registers one argument tuple
  // per convolution shape of ShuffleNet v2 (2.0X scale). ArgNames() and
  // Args() each return the benchmark pointer, so the whole table is a single
  // chained expression; registration order is the order listed here.
  // Tuple column order: N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    // ---- Conv 1 ----
    ->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 24})
    // ---- Stage 2 ----
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 24, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 122})
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 122})
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 122, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 122, 122})
    ->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 122, 1, 1})
    // ---- Stage 3 ----
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 244, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 244, 244})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 244, 244})
    ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 244, 1, 1})
    // ---- Stage 4 ----
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 488, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 488, 488})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 488, 488})
    ->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 488, 1, 1})
    // ---- Conv 5 ----
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 976, 2048});
}

1373

1374

static void MobileNetV1(benchmark::internal::Benchmark* b) {
  // Names the twelve benchmark arguments, then registers the MobileNetV1
  // convolution shapes in order. ArgNames() and Args() each return the
  // benchmark pointer, so the table is written as one chained expression.
  // Tuple column order: N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    ->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 32})
    ->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 32, 1, 1})
    ->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 32, 64})
    ->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 64, 1, 1})
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 128})
    ->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 128, 1, 1})
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 128, 128})
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 128, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 256})
    ->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 256, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 256, 256})
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 256, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 512})
    ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 512, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 512, 512})
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 512, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 1024})
    ->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 1024, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 1024, 1024});
}

1398

1399

static void MobileNetV2(benchmark::internal::Benchmark* b) {
  // Names the twelve benchmark arguments, then registers the MobileNetV2
  // convolution shapes in order. ArgNames() and Args() each return the
  // benchmark pointer, so the table is written as one chained expression.
  // Entries left commented out repeat a shape that is already registered
  // earlier in the list.
  // Tuple column order: N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    ->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 32})

    // ---- Bottleneck 1 ----
    ->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 32, 1, 1})
    ->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 32, 16})

    // ---- Bottleneck 2 ----
    ->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 96})
    ->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 96, 1, 1})
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 96, 24})
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 144})
    ->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 144, 1, 1})
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 144, 24})

    // ---- Bottleneck 3 ----
    // ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 144})
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 144, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 144, 32})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192})
    ->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 192, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 192, 32})
    // ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192})
    // ->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 192, 1, 1})
    // ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 192, 32})

    // ---- Bottleneck 4 ----
    // ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 32, 192})
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 192, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 192, 64})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384})
    ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64})
    // ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384})
    // ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1})
    // ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64})
    // ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384})
    // ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1})
    // ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 64})

    // ---- Bottleneck 5 ----
    // ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 64, 384})
    // ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 384, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 384, 96})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576})
    ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 576, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 576, 96})
    // ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576})
    // ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 576, 1, 1})
    // ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 576, 96})

    // ---- Bottleneck 6 ----
    // ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 576})
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 576, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 160})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960})
    ->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160})
    // ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960})
    // ->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1})
    // ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160})

    // ---- Bottleneck 7 ----
    // ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960})
    // ->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 960, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 320})

    // ---- Pre-pooling Conv2D ----
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 320, 1280})
    // ---- Post-pooling Conv2D ----
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1280, 1000});
}

1483

1484

static void MobileNetV3Small(benchmark::internal::Benchmark* b) {
  // Names the twelve benchmark arguments, then registers the MobileNetV3
  // Small convolution shapes in order. ArgNames() and Args() each return the
  // benchmark pointer, so the table is written as one chained expression.
  // Entries left commented out repeat a shape that is already registered
  // earlier in the list.
  // Tuple column order: N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    // ---- Initial Stage ----
    ->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 16})
    // ---- Bottleneck 1 ----
    ->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 16, 1, 1})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 16, 8})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 8, 16})
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 16, 16})
    // ---- Bottleneck 2 ----
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 16, 72})
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 72, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 72, 24})
    // ---- Bottleneck 3 ----
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 88})
    ->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 88, 1, 1})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 88, 24})
    // ---- Bottleneck 4 ----
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 24, 96})
    ->Args({1, 28, 28, 5, 5, 4, 4, 2, 1, 96, 1, 1})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 96, 24})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 24, 96})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 96, 40})
    // ---- Bottleneck 5 ----
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 240})
    ->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 240, 1, 1})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 64})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 64, 240})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 40})
    // ---- Bottleneck 6 ----
    // ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 240})
    // ->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 240, 1, 1})
    // ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 64})
    // ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 64, 240})
    // ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 40})
    // ---- Bottleneck 7 ----
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 40, 120})
    ->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 120, 1, 1})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 120, 48})
    // ---- Bottleneck 8 ----
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 144})
    ->Args({1, 14, 14, 5, 5, 4, 4, 1, 1, 144, 1, 1})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 40})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 40, 144})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 144, 48})
    // ---- Bottleneck 9 ----
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 48, 288})
    ->Args({1, 14, 14, 5, 5, 4, 4, 2, 1, 288, 1, 1})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 288, 72})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 72, 288})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 288, 96})
    // ---- Bottleneck 10 ----
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576})
    ->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 576, 1, 1})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 144})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 576})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 96})
    // ---- Bottleneck 11 ----
    // ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576})
    // ->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 576, 1, 1})
    // ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 144})
    // ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 144, 576})
    // ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 576, 96})
    // ---- Last Stage ----
    // ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 96, 576})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 576, 1024})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1024, 1001});
}

1568

1569

static void MobileNetV3Large(benchmark::internal::Benchmark* b) {
  // Names the twelve benchmark arguments, then registers the MobileNetV3
  // Large convolution shapes in order. ArgNames() and Args() each return the
  // benchmark pointer, so the table is written as one chained expression.
  // Entries left commented out repeat a shape that is already registered
  // earlier in the list.
  // Tuple column order: N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    // ---- Initial Stage ----
    ->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 16})
    // ---- Bottleneck 1 ----
    ->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 16, 1, 1})
    ->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 16})
    // ---- Bottleneck 2 ----
    ->Args({1, 112, 112, 1, 1, 0, 0, 1, 1, 1, 16, 64})
    ->Args({1, 112, 112, 3, 3, 2, 2, 2, 1, 64, 1, 1})
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 24})
    // ---- Bottleneck 3 ----
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 72})
    ->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 72, 1, 1})
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 72, 24})
    // ---- Bottleneck 4 ----
    // ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 24, 72})
    ->Args({1, 56, 56, 5, 5, 4, 4, 2, 1, 72, 1, 1})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 72, 24})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 24, 72})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 72, 40})
    // ---- Bottleneck 5 ----
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 120})
    ->Args({1, 28, 28, 5, 5, 4, 4, 1, 1, 120, 1, 1})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 120, 40})
    // ---- Bottleneck 6 ----
    // ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 120})
    // ->Args({1, 28, 28, 5, 5, 4, 4, 1, 1, 120, 1, 1})
    // ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 32})
    // ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 32, 120})
    // ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 120, 40})
    // ---- Bottleneck 7 ----
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 40, 240})
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 240, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 240, 80})
    // ---- Bottleneck 8 ----
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 200})
    ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 200, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 200, 80})
    // ---- Bottleneck 9 ----
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 184})
    ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 184, 1, 1})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 184, 80})
    // ---- Bottleneck 10 ----
    // ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 184})
    // ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 184, 1, 1})
    // ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 184, 80})
    // ---- Bottleneck 11 ----
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 80, 480})
    ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 480, 1, 1})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 480, 120})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 120, 480})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 480, 112})
    // ---- Bottleneck 12 ----
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 112, 672})
    ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 672, 1, 1})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 672, 168})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 168, 672})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 672, 112})
    // ---- Bottleneck 13 ----
    // ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 112, 672})
    ->Args({1, 14, 14, 5, 5, 4, 4, 2, 1, 672, 1, 1})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 672, 160})
    // ---- Bottleneck 14 ----
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960})
    ->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 960, 1, 1})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 240})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 960})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160})
    // ---- Bottleneck 15 ----
    // ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960})
    // ->Args({1, 7, 7, 5, 5, 4, 4, 1, 1, 960, 1, 1})
    // ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 240})
    // ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 240, 960})
    // ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 960, 160})
    // ---- Last Stage ----
    // ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 160, 960})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 960, 1280})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1280, 1001});
}

// SqueezeNet 1.0

static void SqueezeNetV10(benchmark::internal::Benchmark* b) {
  // Names the twelve benchmark arguments and registers one argument tuple per
  // distinct convolution shape of SqueezeNet 1.0, in network order. Entries
  // left commented out repeat a shape that is already registered earlier in
  // the list.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"});

  /************************** Conv 1 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1, 224, 224,  7,  7,  6,  6, 2, 1, 1,    3,   96});
  /************************** Fire 2 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   96,   16});
  b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   16,   64});
  b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   16,   64});
  /************************** Fire 3 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  // The input here is square 55x55, matching the Fire 2 and Fire 4 entries
  // around it; H must be 55 so the benchmarked shape is not 56x55.
  b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,  128,   16});
  //b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   16,   64});
  //b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   16,   64});
  /************************** Fire 4 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,  128,   32});
  b->Args({1,  55,  55,  1,  1,  0,  0, 1, 1, 1,   32,  128});
  b->Args({1,  55,  55,  3,  3,  2,  2, 1, 1, 1,   32,  128});
  /************************** Fire 5 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  256,   32});
  b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   32,  128});
  b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   32,  128});
  /************************** Fire 6 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  256,   48});
  b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   48,  192});
  b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   48,  192});
  /************************** Fire 7 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  384,   48});
  //b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   48,  192});
  //b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   48,  192});
  /************************** Fire 8 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,  384,   64});
  b->Args({1,  27,  27,  1,  1,  0,  0, 1, 1, 1,   64,  256});
  b->Args({1,  27,  27,  3,  3,  2,  2, 1, 1, 1,   64,  256});
  /************************** Fire 9 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  512,   64});
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,   64,  256});
  b->Args({1,  13,  13,  3,  3,  2,  2, 1, 1, 1,   64,  256});
  /************************* Conv 10 *************************/
  /*       N   H    W   KH  KW  PH  PW  S  D  G GCin  GCout */
  b->Args({1,  13,  13,  1,  1,  0,  0, 1, 1, 1,  512, 1000});
}

// SqueezeNet 1.1

static void SqueezeNetV11(benchmark::internal::Benchmark* b) {
  // Names the twelve benchmark arguments, then registers the SqueezeNet 1.1
  // convolution shapes in order. ArgNames() and Args() each return the
  // benchmark pointer, so the table is written as one chained expression.
  // Entries left commented out repeat a shape that is already registered
  // earlier in the list.
  // Tuple column order: N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    // ---- Conv 1 ----
    ->Args({1, 224, 224, 3, 3, 2, 2, 2, 1, 1, 3, 64})
    // ---- Fire 2 ----
    ->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 64, 16})
    ->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64})
    ->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64})
    // ---- Fire 3 ----
    ->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 128, 16})
    // ->Args({1, 55, 55, 1, 1, 0, 0, 1, 1, 1, 16, 64})
    // ->Args({1, 55, 55, 3, 3, 2, 2, 1, 1, 1, 16, 64})
    // ---- Fire 4 ----
    ->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 128, 32})
    ->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 32, 128})
    ->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 32, 128})
    // ---- Fire 5 ----
    ->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 256, 32})
    // ->Args({1, 27, 27, 1, 1, 0, 0, 1, 1, 1, 32, 128})
    // ->Args({1, 27, 27, 3, 3, 2, 2, 1, 1, 1, 32, 128})
    // ---- Fire 6 ----
    ->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 256, 48})
    ->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 48, 192})
    ->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 48, 192})
    // ---- Fire 7 ----
    ->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 384, 48})
    // ->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 48, 192})
    // ->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 48, 192})
    // ---- Fire 8 ----
    ->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 384, 64})
    ->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 64, 256})
    ->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 64, 256})
    // ---- Fire 9 ----
    ->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 64})
    // ->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 64, 256})
    // ->Args({1, 13, 13, 3, 3, 2, 2, 1, 1, 1, 64, 256})
    // ---- Conv 10 ----
    ->Args({1, 13, 13, 1, 1, 0, 0, 1, 1, 1, 512, 1000});
}

1773

1774

static void InceptionV3(benchmark::internal::Benchmark* b) {
  // Names the twelve benchmark arguments, then registers the Inception v3
  // convolution shapes in order. ArgNames() and Args() each return the
  // benchmark pointer, so the table is written as one chained expression.
  // Tuple column order: N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    ->Args({1, 299, 299, 3, 3, 0, 0, 2, 1, 1, 3, 32})
    ->Args({1, 149, 149, 3, 3, 0, 0, 1, 1, 1, 32, 32})
    ->Args({1, 147, 147, 3, 3, 2, 2, 1, 1, 1, 32, 64})
    ->Args({1, 73, 73, 1, 1, 0, 0, 1, 1, 1, 64, 80})
    ->Args({1, 73, 73, 3, 3, 0, 0, 1, 1, 1, 80, 192})
    ->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 64})
    ->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 48})
    ->Args({1, 35, 35, 5, 5, 4, 4, 1, 1, 1, 48, 64})
    ->Args({1, 35, 35, 3, 3, 2, 2, 1, 1, 1, 64, 96})
    ->Args({1, 35, 35, 3, 3, 2, 2, 1, 1, 1, 96, 96})
    ->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 192, 32})
    ->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 256, 64})
    ->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 256, 48})
    ->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 288, 64})
    ->Args({1, 35, 35, 1, 1, 0, 0, 1, 1, 1, 288, 48})
    ->Args({1, 35, 35, 3, 3, 0, 0, 2, 1, 1, 288, 384})
    ->Args({1, 35, 35, 3, 3, 0, 0, 2, 1, 1, 96, 96})
    ->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 192})
    ->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 128})
    ->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 128, 128})
    ->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 128, 192})
    ->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 128, 128})
    ->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 128, 192})
    ->Args({1, 17, 17, 1, 1, 0, 0, 1, 1, 1, 768, 160})
    ->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 160, 160})
    ->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 160, 192})
    ->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 160, 160})
    ->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 160, 192})
    ->Args({1, 17, 17, 1, 7, 0, 6, 1, 1, 1, 192, 192})
    ->Args({1, 17, 17, 7, 1, 6, 0, 1, 1, 1, 192, 192})
    ->Args({1, 17, 17, 3, 3, 0, 0, 2, 1, 1, 192, 320})
    ->Args({1, 17, 17, 3, 3, 0, 0, 2, 1, 1, 192, 192})
    ->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 320})
    ->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 384})
    ->Args({1, 8, 8, 1, 3, 0, 2, 1, 1, 1, 384, 384})
    ->Args({1, 8, 8, 3, 1, 2, 0, 1, 1, 1, 384, 384})
    ->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 448})
    ->Args({1, 8, 8, 3, 3, 2, 2, 1, 1, 1, 448, 384})
    ->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 1280, 192})
    ->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 320})
    ->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 384})
    ->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 448})
    ->Args({1, 8, 8, 1, 1, 0, 0, 1, 1, 1, 2048, 192})
    ->Args({1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 2048, 1001});
}

1823

1824

static void ResNet18(benchmark::internal::Benchmark* b) {
  // Names the twelve benchmark arguments, then registers the ResNet-18
  // convolution shapes in order. ArgNames() and Args() each return the
  // benchmark pointer, so the table is written as one chained expression.
  // Tuple column order: N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    // ---- Conv 1 ----
    ->Args({1, 224, 224, 7, 7, 6, 6, 2, 1, 1, 3, 64})
    // ---- Conv 2.X ----
    ->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64})
    // ---- Conv 3.X ----
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 1, 64, 128})
    ->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 128, 128})
    ->Args({1, 56, 56, 1, 1, 0, 0, 2, 1, 1, 64, 128})
    // ---- Conv 4.X ----
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 1, 128, 256})
    ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 256, 256})
    ->Args({1, 28, 28, 1, 1, 0, 0, 2, 1, 1, 128, 256})
    // ---- Conv 5.X ----
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 1, 256, 512})
    ->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 1, 512, 512})
    ->Args({1, 14, 14, 1, 1, 0, 0, 2, 1, 1, 256, 512});
}

1849

1850

static void ResNet50(benchmark::internal::Benchmark* b) {
  // Names the twelve benchmark arguments, then registers the ResNet-50
  // convolution shapes in order. ArgNames() and Args() each return the
  // benchmark pointer, so the table is written as one chained expression.
  // Entries left commented out repeat a shape that is already registered
  // earlier in the list.
  // Tuple column order: N, H, W, KH, KW, PH, PW, S, D, G, GCin, GCout.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    // ---- Conv 1 ----
    ->Args({1, 224, 224, 7, 7, 6, 6, 2, 1, 1, 3, 64})
    // ---- Conv 2.1 ----
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 64})
    ->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64})
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256})
    // ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256})
    // ---- Conv 2.X ----
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 64})
    // ->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 64, 64})
    // ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 64, 256})
    // ---- Conv 3.1 ----
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 128})
    ->Args({1, 56, 56, 3, 3, 2, 2, 2, 1, 1, 128, 128})
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 512})
    ->Args({1, 56, 56, 1, 1, 0, 0, 2, 1, 1, 256, 512})
    // ---- Conv 3.X ----
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 128})
    ->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 128, 128})
    // ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 128, 512})
    // ---- Conv 4.1 ----
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 256})
    ->Args({1, 28, 28, 3, 3, 2, 2, 2, 1, 1, 256, 256})
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 1024})
    ->Args({1, 28, 28, 1, 1, 0, 0, 2, 1, 1, 512, 1024})
    // ---- Conv 4.X ----
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 1024, 256})
    ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 256, 256})
    // ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 256, 1024})
    // ---- Conv 5.1 ----
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 1024, 512})
    ->Args({1, 14, 14, 3, 3, 2, 2, 2, 1, 1, 512, 512})
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 2048})
    ->Args({1, 14, 14, 1, 1, 0, 0, 2, 1, 1, 1024, 2048})
    // ---- Conv 5.X ----
    ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 2048, 512})
    ->Args({1, 7, 7, 3, 3, 2, 2, 1, 1, 1, 512, 512});
    // ->Args({1, 7, 7, 1, 1, 0, 0, 1, 1, 1, 512, 2048})
}

1901

1902

// Argument generator for the "VGG" convolution benchmarks.
// Declares the twelve argument names, then appends one 12-value tuple per
// convolution shape, in the order Conv 1.1 through Conv 5.3.
static void VGG(benchmark::internal::Benchmark* b) {
  // ArgNames()/Args() each hand back the same Benchmark*, so the whole list is
  // expressed as a single fluent chain.
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    /* Tuple layout: N H W KH KW PH PW S D G GCin GCout */

    /* ---- Conv 1.1 ---- */
    ->Args({1, 224, 224, 3, 3, 2, 2, 1, 1, 1, 3, 64})
    /* ---- Conv 1.2 ---- */
    ->Args({1, 224, 224, 3, 3, 2, 2, 1, 1, 1, 64, 64})

    /* ---- Conv 2.1 ---- */
    ->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 1, 64, 128})
    /* ---- Conv 2.2 ---- */
    ->Args({1, 112, 112, 3, 3, 2, 2, 1, 1, 1, 128, 128})

    /* ---- Conv 3.1 ---- */
    ->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 128, 256})
    /* ---- Conv 3.2 ---- */
    ->Args({1, 56, 56, 3, 3, 2, 2, 1, 1, 1, 256, 256})
    /* ---- Conv 3.3 ---- */
    ->Args({1, 56, 56, 1, 1, 0, 0, 1, 1, 1, 256, 256})

    /* ---- Conv 4.1 ---- */
    ->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 256, 512})
    /* ---- Conv 4.2 ---- */
    ->Args({1, 28, 28, 3, 3, 2, 2, 1, 1, 1, 512, 512})
    /* ---- Conv 4.3 ---- */
    ->Args({1, 28, 28, 1, 1, 0, 0, 1, 1, 1, 512, 512})

    /* ---- Conv 5.X ---- */
    ->Args({1, 14, 14, 3, 3, 2, 2, 1, 1, 1, 512, 512})
    /* ---- Conv 5.3 ---- */
    ->Args({1, 14, 14, 1, 1, 0, 0, 1, 1, 1, 512, 512});
}

// SRCNN (9-1-5): argument generator for a three-convolution stack with
// 9x9, 1x1 and 5x5 kernels. Each tuple follows the ArgNames order below.
static void SRCNN915(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    /* Tuple layout: N H W KH KW PH PW S D G GCin GCout */
    ->Args({1, 384, 384, 9, 9, 0, 0, 1, 1, 1, 1, 64})
    ->Args({1, 376, 376, 1, 1, 0, 0, 1, 1, 1, 64, 32})
    ->Args({1, 376, 376, 5, 5, 0, 0, 1, 1, 1, 32, 1});
}

// SRCNN (9-3-5): argument generator for a three-convolution stack with
// 9x9, 3x3 and 5x5 kernels. Each tuple follows the ArgNames order below.
static void SRCNN935(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    /* Tuple layout: N H W KH KW PH PW S D G GCin GCout */
    ->Args({1, 384, 384, 9, 9, 0, 0, 1, 1, 1, 1, 64})
    ->Args({1, 376, 376, 3, 3, 0, 0, 1, 1, 1, 64, 32})
    ->Args({1, 374, 374, 5, 5, 0, 0, 1, 1, 1, 32, 1});
}

// SRCNN (9-5-5): argument generator for a three-convolution stack with
// 9x9, 5x5 and 5x5 kernels. Each tuple follows the ArgNames order below.
static void SRCNN955(benchmark::internal::Benchmark* b) {
  b->ArgNames({"N", "H", "W", "KH", "KW", "PH", "PW", "S", "D", "G", "GCin", "GCout"})
    /* Tuple layout: N H W KH KW PH PW S D G GCin GCout */
    ->Args({1, 384, 384, 9, 9, 0, 0, 1, 1, 1, 1, 64})
    ->Args({1, 376, 376, 5, 5, 0, 0, 1, 1, 1, 64, 32})
    ->Args({1, 372, 372, 5, 5, 0, 0, 1, 1, 1, 32, 1});
}

1976

Chao Mei

2020-07-23 09:35:11 -0700

[diff] [blame]

1977

#ifndef XNN_NO_F16_OPERATORS

Marat Dukhan

2020-08-04 16:38:22 -0700

[diff] [blame]

1978

BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();

1979

BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();

1980

BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();

1981

BENCHMARK_CAPTURE(xnnpack_convolution_f16, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();

1982

BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();

1983

BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();

1984

BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();

1985

BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();

1986

BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();

1987

BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();

1988

BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();

1989

BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();

1990

BENCHMARK_CAPTURE(xnnpack_convolution_f16, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();

1991

BENCHMARK_CAPTURE(xnnpack_convolution_f16, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();

1992

BENCHMARK_CAPTURE(xnnpack_convolution_f16, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();

1993

BENCHMARK_CAPTURE(xnnpack_convolution_f16, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();

1994

BENCHMARK_CAPTURE(xnnpack_convolution_f16, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();

1995

BENCHMARK_CAPTURE(xnnpack_convolution_f16, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();

1996

BENCHMARK_CAPTURE(xnnpack_convolution_f16, vgg, "VGG")->Apply(VGG)->UseRealTime();

1997

BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();

1998

BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();

1999

BENCHMARK_CAPTURE(xnnpack_convolution_f16, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();

Chao Mei

2020-07-23 09:35:11 -0700

[diff] [blame]

2000

#endif // XNN_NO_F16_OPERATORS

Frank Barchard

2020-06-26 14:07:19 -0700

[diff] [blame]

2001

Marat Dukhan

2020-08-04 16:38:22 -0700

[diff] [blame]

2002

#ifndef XNN_NO_F32_OPERATORS

2003

BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();

2004

BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();

2005

BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();

2006

BENCHMARK_CAPTURE(xnnpack_convolution_f32, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();

2007

BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();

2008

BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();

2009

BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();

2010

BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();

2011

BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();

2012

BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();

2013

BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();

2014

BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();

2015

BENCHMARK_CAPTURE(xnnpack_convolution_f32, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();

2016

BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();

2017

BENCHMARK_CAPTURE(xnnpack_convolution_f32, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();

2018

BENCHMARK_CAPTURE(xnnpack_convolution_f32, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();

2019

BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();

2020

BENCHMARK_CAPTURE(xnnpack_convolution_f32, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();

2021

BENCHMARK_CAPTURE(xnnpack_convolution_f32, vgg, "VGG")->Apply(VGG)->UseRealTime();

2022

BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();

2023

BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();

2024

BENCHMARK_CAPTURE(xnnpack_convolution_f32, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();

2025

#endif // XNN_NO_F32_OPERATORS

2026

2027

#ifndef XNN_NO_QS8_OPERATORS

2028

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();

2029

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();

2030

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();

2031

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();

2032

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();

2033

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();

2034

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();

2035

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();

2036

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();

2037

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();

2038

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();

2039

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();

2040

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();

2041

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();

2042

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();

2043

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();

2044

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();

2045

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();

2046

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, vgg, "VGG")->Apply(VGG)->UseRealTime();

2047

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();

2048

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();

2049

BENCHMARK_CAPTURE(xnnpack_convolution_qs8, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();

2050

#endif // XNN_NO_QS8_OPERATORS

XNNPACK Team

2019-09-27 18:10:33 -0700

[diff] [blame]

2051

Chao Mei

2020-07-23 09:35:11 -0700

[diff] [blame]

2052

#ifndef XNN_NO_QU8_OPERATORS

Marat Dukhan

2020-08-04 16:38:22 -0700

[diff] [blame]

2053

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v1, "MobileNet v1")->Apply(MobileNetV1)->UseRealTime();

2054

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v2, "MobileNet v2")->Apply(MobileNetV2)->UseRealTime();

2055

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v3_small, "MobileNet v3 Small")->Apply(MobileNetV3Small)->UseRealTime();

2056

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, mobilenet_v3_large, "MobileNet v3 Large")->Apply(MobileNetV3Large)->UseRealTime();

2057

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g1, "ShuffleNet v1 (1 group)")->Apply(ShuffleNetV1G1)->UseRealTime();

2058

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g2, "ShuffleNet v1 (2 groups)")->Apply(ShuffleNetV1G2)->UseRealTime();

2059

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g3, "ShuffleNet v1 (3 groups)")->Apply(ShuffleNetV1G3)->UseRealTime();

2060

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g4, "ShuffleNet v1 (4 groups)")->Apply(ShuffleNetV1G4)->UseRealTime();

2061

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v1_g8, "ShuffleNet v1 (8 groups)")->Apply(ShuffleNetV1G8)->UseRealTime();

2062

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x05, "ShuffleNet v2 0.5X")->Apply(ShuffleNetV2X05)->UseRealTime();

2063

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x10, "ShuffleNet v2 1.0X")->Apply(ShuffleNetV2X10)->UseRealTime();

2064

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x15, "ShuffleNet v2 1.5X")->Apply(ShuffleNetV2X15)->UseRealTime();

2065

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, shufflenet_v2_x20, "ShuffleNet v2 2.0X")->Apply(ShuffleNetV2X20)->UseRealTime();

2066

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, squeezenet_v10, "SqueezeNet 1.0")->Apply(SqueezeNetV10)->UseRealTime();

2067

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, squeezenet_v11, "SqueezeNet 1.1")->Apply(SqueezeNetV11)->UseRealTime();

2068

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, inception_v3, "Inception v3")->Apply(InceptionV3)->UseRealTime();

2069

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, resnet18, "ResNet-18")->Apply(ResNet18)->UseRealTime();

2070

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, resnet50, "ResNet-50")->Apply(ResNet50)->UseRealTime();

2071

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, vgg, "VGG")->Apply(VGG)->UseRealTime();

2072

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn915, "SRCNN (9-1-5)")->Apply(SRCNN915)->UseRealTime();

2073

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn935, "SRCNN (9-3-5)")->Apply(SRCNN935)->UseRealTime();

2074

BENCHMARK_CAPTURE(xnnpack_convolution_qu8, srcnn955, "SRCNN (9-5-5)")->Apply(SRCNN955)->UseRealTime();

Chao Mei

2020-07-23 09:35:11 -0700

[diff] [blame]

2075

#endif // XNN_NO_QU8_OPERATORS

XNNPACK Team