// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <array>
#include <cfloat>
#include <cmath>
#include <cstdint>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <xnnpack.h>

#include <benchmark/benchmark.h>
#include <fp16/fp16.h>
#include "bench/utils.h"
#ifdef BENCHMARK_TENSORFLOW_LITE
#include "flatbuffers/include/flatbuffers/flatbuffers.h"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/version.h"
#endif  // BENCHMARK_TENSORFLOW_LITE

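// Each benchmark below measures one XNNPACK Convert operator on a contiguous,
// single-channel batch whose size comes from state.range(0); under
// BENCHMARK_TENSORFLOW_LITE, an equivalent single-op TFLite model serves as
// the baseline. All variants report elements/s and bytes/s throughput.

// Benchmarks the XNNPACK F16->F32 Convert operator: widens a batch of IEEE
// half-precision values (stored as uint16_t) to single precision.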
void xnnpack_convert_f16_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // Pad the input: XNNPACK kernels may read up to XNN_EXTRA_BYTES past the end.
  std::vector<uint16_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint16_t));
  std::generate(input.begin(), input.end(), std::ref(f16rng));
  // Pre-fill the output with NaN so stale values are distinguishable from results.
  std::vector<float> output(batch_size);
  std::fill(output.begin(), output.end(), std::nanf(""));

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t convert_op = nullptr;
  status = xnn_create_convert_nc_f16_f32(
    1 /* channels */, 1 /* input stride */, 1 /* output stride */,
    0 /* flags */, &convert_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to create F16->F32 Convert operator");
    return;
  }

  status = xnn_setup_convert_nc_f16_f32(
    convert_op, batch_size,
    input.data(), output.data(),
    nullptr /* thread pool */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup F16->F32 Convert operator");
    return;
  }

  // Each timed iteration converts the whole batch.
  for (auto _ : state) {
    status = xnn_run_operator(convert_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run F16->F32 Convert operator");
      return;
    }
  }

  status = xnn_delete_operator(convert_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete F16->F32 Convert operator");
    return;
  }
  convert_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Throughput: elements converted per second, and bytes moved per second
  // (2 input + 4 output bytes per element).
  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = batch_size * (sizeof(uint16_t) + sizeof(float));
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}

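// Benchmarks the XNNPACK F32->F16 Convert operator: narrows single-precision
// inputs to IEEE half precision.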
void xnnpack_convert_f32_f16(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));

  std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  std::vector<uint16_t> output(batch_size);
  std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t convert_op = nullptr;
  status = xnn_create_convert_nc_f32_f16(
    1 /* channels */, 1 /* input stride */, 1 /* output stride */,
    0 /* flags */, &convert_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to create F32->F16 Convert operator");
    return;
  }

  status = xnn_setup_convert_nc_f32_f16(
    convert_op, batch_size,
    input.data(), output.data(),
    nullptr /* thread pool */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup F32->F16 Convert operator");
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(convert_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run F32->F16 Convert operator");
      return;
    }
  }

  status = xnn_delete_operator(convert_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete F32->F16 Convert operator");
    return;
  }
  convert_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(uint16_t));
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}

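// Benchmarks the XNNPACK F32->QS8 Convert operator: quantizes single-precision
// inputs to signed 8-bit integers with scale 1/128 and zero point 1.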
void xnnpack_convert_f32_qs8(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));

  std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  std::vector<int8_t> output(batch_size);
  std::fill(output.begin(), output.end(), 0);

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t convert_op = nullptr;
  status = xnn_create_convert_nc_f32_qs8(
    1 /* channels */, 1 /* input stride */, 1 /* output stride */,
    1.0f / 128.0f /* scale */, 1 /* zero point */,
    std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max(),
    0 /* flags */, &convert_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to create F32->QS8 Convert operator");
    return;
  }

  status = xnn_setup_convert_nc_f32_qs8(
    convert_op, batch_size,
    input.data(), output.data(),
    nullptr /* thread pool */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup F32->QS8 Convert operator");
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(convert_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run F32->QS8 Convert operator");
      return;
    }
  }

  status = xnn_delete_operator(convert_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete F32->QS8 Convert operator");
    return;
  }
  convert_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(int8_t));
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}

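// Benchmarks the XNNPACK F32->QU8 Convert operator: quantizes single-precision
// inputs to unsigned 8-bit integers with scale 1/128 and zero point 127.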
void xnnpack_convert_f32_qu8(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));

  std::vector<float> input(batch_size + XNN_EXTRA_BYTES / sizeof(float));
  std::generate(input.begin(), input.end(), std::ref(f32rng));
  std::vector<uint8_t> output(batch_size);
  std::fill(output.begin(), output.end(), 0);

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t convert_op = nullptr;
  status = xnn_create_convert_nc_f32_qu8(
    1 /* channels */, 1 /* input stride */, 1 /* output stride */,
    1.0f / 128.0f /* scale */, 127 /* zero point */,
    std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max(),
    0 /* flags */, &convert_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to create F32->QU8 Convert operator");
    return;
  }

  status = xnn_setup_convert_nc_f32_qu8(
    convert_op, batch_size,
    input.data(), output.data(),
    nullptr /* thread pool */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup F32->QU8 Convert operator");
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(convert_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run F32->QU8 Convert operator");
      return;
    }
  }

  status = xnn_delete_operator(convert_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete F32->QU8 Convert operator");
    return;
  }
  convert_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(uint8_t));
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}

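// Benchmarks the XNNPACK QS8->F32 Convert operator: dequantizes signed 8-bit
// inputs to single precision with scale 1/255 and zero point -128.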
void xnnpack_convert_qs8_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i8rng = std::bind(
    std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
    std::ref(rng));

  std::vector<int8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(int8_t));
  std::generate(input.begin(), input.end(), std::ref(i8rng));
  std::vector<float> output(batch_size);
  std::fill(output.begin(), output.end(), std::nanf(""));

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t convert_op = nullptr;
  status = xnn_create_convert_nc_qs8_f32(
    1 /* channels */, 1 /* input stride */, 1 /* output stride */,
    1.0f / 255.0f /* scale */, -128 /* zero point */,
    0 /* flags */, &convert_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to create QS8->F32 Convert operator");
    return;
  }

  status = xnn_setup_convert_nc_qs8_f32(
    convert_op, batch_size,
    input.data(), output.data(),
    nullptr /* thread pool */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup QS8->F32 Convert operator");
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(convert_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run QS8->F32 Convert operator");
      return;
    }
  }

  status = xnn_delete_operator(convert_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete QS8->F32 Convert operator");
    return;
  }
  convert_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = batch_size * (sizeof(int8_t) + sizeof(float));
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}

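// Benchmarks the XNNPACK QU8->F32 Convert operator: dequantizes unsigned 8-bit
// inputs to single precision with scale 1/128 and zero point 128.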
void xnnpack_convert_qu8_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto u8rng = std::bind(
    std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
    std::ref(rng));

  std::vector<uint8_t> input(batch_size + XNN_EXTRA_BYTES / sizeof(uint8_t));
  std::generate(input.begin(), input.end(), std::ref(u8rng));
  std::vector<float> output(batch_size);
  std::fill(output.begin(), output.end(), std::nanf(""));

  xnn_status status = xnn_initialize(nullptr /* allocator */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to initialize XNNPACK");
    return;
  }

  xnn_operator_t convert_op = nullptr;
  status = xnn_create_convert_nc_qu8_f32(
    1 /* channels */, 1 /* input stride */, 1 /* output stride */,
    1.0f / 128.0f /* scale */, 128 /* zero point */,
    0 /* flags */, &convert_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to create QU8->F32 Convert operator");
    return;
  }

  status = xnn_setup_convert_nc_qu8_f32(
    convert_op, batch_size,
    input.data(), output.data(),
    nullptr /* thread pool */);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to setup QU8->F32 Convert operator");
    return;
  }

  for (auto _ : state) {
    status = xnn_run_operator(convert_op, nullptr /* thread pool */);
    if (status != xnn_status_success) {
      state.SkipWithError("failed to run QU8->F32 Convert operator");
      return;
    }
  }

  status = xnn_delete_operator(convert_op);
  if (status != xnn_status_success) {
    state.SkipWithError("failed to delete QU8->F32 Convert operator");
    return;
  }
  convert_op = nullptr;

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = batch_size * (sizeof(uint8_t) + sizeof(float));
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);
}

#ifdef BENCHMARK_TENSORFLOW_LITE
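// TensorFlow Lite baseline for F16->F32: a single-operator DEQUANTIZE model,
// built in memory with FlatBuffers and run single-threaded on the built-in
// (non-delegated) kernels.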
void tflite_convert_f16_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // Build a single-operator DEQUANTIZE (FLOAT16 -> FLOAT32) model in memory.
  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
    CreateOperatorCode(builder, tflite::BuiltinOperator_DEQUANTIZE);

  std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  }};

  const std::array<int32_t, 1> shape{{
    static_cast<int32_t>(batch_size)
  }};

  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT16),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32)
  }};

  const std::array<int32_t, 1> op_inputs{{0}};
  const std::array<int32_t, 1> op_outputs{{1}};
  flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
    0 /* opcode_index */,
    builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
    builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));

  const std::array<int32_t, 1> graph_inputs{{0}};
  const std::array<int32_t, 1> graph_outputs{{1}};
  flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
    builder,
    builder.CreateVector(tensors.data(), tensors.size()),
    builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
    builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
    builder.CreateVector(&op, 1));

  flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Dequantize model");

  flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
    TFLITE_SCHEMA_VERSION,
    builder.CreateVector(&operator_code, 1),
    builder.CreateVector(&subgraph, 1),
    description,
    builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_buffer);

  // Run the model single-threaded on the reference (non-delegated) kernels.
  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  // Fill the FLOAT16 input through the raw tensor buffer (stored as uint16_t).
  uint16_t* input_data = reinterpret_cast<uint16_t*>(interpreter->tensor(0)->data.data);
  std::generate(input_data, input_data + batch_size, std::ref(f16rng));

  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = batch_size * (sizeof(uint16_t) + sizeof(float));
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}

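// TensorFlow Lite baseline for F32->QS8: a single-operator QUANTIZE model with
// the same scale (1/128) and zero point (1) as the XNNPACK benchmark above.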
void tflite_convert_f32_qs8(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));

  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
    CreateOperatorCode(builder, tflite::BuiltinOperator_QUANTIZE);

  std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  }};

  const std::array<int32_t, 1> shape{{
    static_cast<int32_t>(batch_size)
  }};

  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
                         tflite::CreateQuantizationParameters(builder,
                           0 /* min */, 0 /* max */,
                           builder.CreateVector<float>({1.0f / 128.0f /* scale */}),
                           builder.CreateVector<int64_t>({1 /* zero point */})))
  }};

  const std::array<int32_t, 1> op_inputs{{0}};
  const std::array<int32_t, 1> op_outputs{{1}};
  flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
    0 /* opcode_index */,
    builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
    builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));

  const std::array<int32_t, 1> graph_inputs{{0}};
  const std::array<int32_t, 1> graph_outputs{{1}};
  flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
    builder,
    builder.CreateVector(tensors.data(), tensors.size()),
    builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
    builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
    builder.CreateVector(&op, 1));

  flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Quantize model");

  flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
    TFLITE_SCHEMA_VERSION,
    builder.CreateVector(&operator_code, 1),
    builder.CreateVector(&subgraph, 1),
    description,
    builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_buffer);

  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  std::generate(
    interpreter->typed_tensor<float>(0),
    interpreter->typed_tensor<float>(0) + batch_size,
    std::ref(f32rng));

  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(int8_t));
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}

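// TensorFlow Lite baseline for F32->QU8: a single-operator QUANTIZE model with
// the same scale (1/128) and zero point (127) as the XNNPACK benchmark above.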
void tflite_convert_f32_qu8(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), std::ref(rng));

  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
    CreateOperatorCode(builder, tflite::BuiltinOperator_QUANTIZE);

  std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  }};

  const std::array<int32_t, 1> shape{{
    static_cast<int32_t>(batch_size)
  }};

  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
                         tflite::CreateQuantizationParameters(builder,
                           0 /* min */, 0 /* max */,
                           builder.CreateVector<float>({1.0f / 128.0f /* scale */}),
                           builder.CreateVector<int64_t>({127 /* zero point */})))
  }};

  const std::array<int32_t, 1> op_inputs{{0}};
  const std::array<int32_t, 1> op_outputs{{1}};
  flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
    0 /* opcode_index */,
    builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
    builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));

  const std::array<int32_t, 1> graph_inputs{{0}};
  const std::array<int32_t, 1> graph_outputs{{1}};
  flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
    builder,
    builder.CreateVector(tensors.data(), tensors.size()),
    builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
    builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
    builder.CreateVector(&op, 1));

  flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Quantize model");

  flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
    TFLITE_SCHEMA_VERSION,
    builder.CreateVector(&operator_code, 1),
    builder.CreateVector(&subgraph, 1),
    description,
    builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_buffer);

  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  std::generate(
    interpreter->typed_tensor<float>(0),
    interpreter->typed_tensor<float>(0) + batch_size,
    std::ref(f32rng));

  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = batch_size * (sizeof(float) + sizeof(uint8_t));
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}

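// TensorFlow Lite baseline for QS8->F32: a single-operator DEQUANTIZE model
// with the same scale (1/255) and zero point (-128) as the XNNPACK benchmark
// above.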
void tflite_convert_qs8_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i8rng = std::bind(
    std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
    std::ref(rng));

  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
    CreateOperatorCode(builder, tflite::BuiltinOperator_DEQUANTIZE);

  std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  }};

  const std::array<int32_t, 1> shape{{
    static_cast<int32_t>(batch_size)
  }};

  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_INT8, 0 /* buffer */, 0 /* name */,
                         tflite::CreateQuantizationParameters(builder,
                           0 /* min */, 0 /* max */,
                           builder.CreateVector<float>({1.0f / 255.0f /* scale */}),
                           builder.CreateVector<int64_t>({-128 /* zero point */}))),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32)
  }};

  const std::array<int32_t, 1> op_inputs{{0}};
  const std::array<int32_t, 1> op_outputs{{1}};
  flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
    0 /* opcode_index */,
    builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
    builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));

  const std::array<int32_t, 1> graph_inputs{{0}};
  const std::array<int32_t, 1> graph_outputs{{1}};
  flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
    builder,
    builder.CreateVector(tensors.data(), tensors.size()),
    builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
    builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
    builder.CreateVector(&op, 1));

  flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Dequantize model");

  flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
    TFLITE_SCHEMA_VERSION,
    builder.CreateVector(&operator_code, 1),
    builder.CreateVector(&subgraph, 1),
    description,
    builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_buffer);

  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  std::generate(
    interpreter->typed_tensor<int8_t>(0),
    interpreter->typed_tensor<int8_t>(0) + batch_size,
    std::ref(i8rng));

  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = batch_size * (sizeof(int8_t) + sizeof(float));
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}

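// TensorFlow Lite baseline for QU8->F32: a single-operator DEQUANTIZE model
// with the same scale (1/128) and zero point (128) as the XNNPACK benchmark
// above.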
void tflite_convert_qu8_f32(benchmark::State& state) {
  const size_t batch_size = state.range(0);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto u8rng = std::bind(
    std::uniform_int_distribution<int32_t>(std::numeric_limits<uint8_t>::min(), std::numeric_limits<uint8_t>::max()),
    std::ref(rng));

  flatbuffers::FlatBufferBuilder builder;
  flatbuffers::Offset<tflite::OperatorCode> operator_code =
    CreateOperatorCode(builder, tflite::BuiltinOperator_DEQUANTIZE);

  std::array<flatbuffers::Offset<tflite::Buffer>, 1> buffers{{
    tflite::CreateBuffer(builder, builder.CreateVector({})),
  }};

  const std::array<int32_t, 1> shape{{
    static_cast<int32_t>(batch_size)
  }};

  const std::array<flatbuffers::Offset<tflite::Tensor>, 2> tensors{{
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_UINT8, 0 /* buffer */, 0 /* name */,
                         tflite::CreateQuantizationParameters(builder,
                           0 /* min */, 0 /* max */,
                           builder.CreateVector<float>({1.0f / 128.0f /* scale */}),
                           builder.CreateVector<int64_t>({128 /* zero point */}))),
    tflite::CreateTensor(builder,
                         builder.CreateVector<int32_t>(shape.data(), shape.size()),
                         tflite::TensorType_FLOAT32)
  }};

  const std::array<int32_t, 1> op_inputs{{0}};
  const std::array<int32_t, 1> op_outputs{{1}};
  flatbuffers::Offset<tflite::Operator> op = tflite::CreateOperator(builder,
    0 /* opcode_index */,
    builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
    builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()));

  const std::array<int32_t, 1> graph_inputs{{0}};
  const std::array<int32_t, 1> graph_outputs{{1}};
  flatbuffers::Offset<tflite::SubGraph> subgraph = tflite::CreateSubGraph(
    builder,
    builder.CreateVector(tensors.data(), tensors.size()),
    builder.CreateVector<int32_t>(graph_inputs.data(), graph_inputs.size()),
    builder.CreateVector<int32_t>(graph_outputs.data(), graph_outputs.size()),
    builder.CreateVector(&op, 1));

  flatbuffers::Offset<flatbuffers::String> description = builder.CreateString("Dequantize model");

  flatbuffers::Offset<tflite::Model> model_buffer = tflite::CreateModel(builder,
    TFLITE_SCHEMA_VERSION,
    builder.CreateVector(&operator_code, 1),
    builder.CreateVector(&subgraph, 1),
    description,
    builder.CreateVector(buffers.data(), buffers.size()));

  builder.Finish(model_buffer);

  const tflite::Model* model = tflite::GetModel(builder.GetBufferPointer());
  tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates resolver;
  tflite::InterpreterBuilder interpreterBuilder(model, resolver);
  std::unique_ptr<tflite::Interpreter> interpreter;
  if (interpreterBuilder(&interpreter) != kTfLiteOk || interpreter == nullptr) {
    state.SkipWithError("failed to create TFLite interpreter");
    return;
  }
  interpreter->SetNumThreads(1);

  if (interpreter->AllocateTensors() != kTfLiteOk) {
    state.SkipWithError("failed to allocate tensors");
    return;
  }

  std::generate(
    interpreter->typed_tensor<uint8_t>(0),
    interpreter->typed_tensor<uint8_t>(0) + batch_size,
    std::ref(u8rng));

  for (auto _ : state) {
    if (interpreter->Invoke() != kTfLiteOk) {
      state.SkipWithError("failed to invoke TFLite interpreter");
      return;
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["elements"] =
    benchmark::Counter(uint64_t(state.iterations()) * batch_size, benchmark::Counter::kIsRate);

  const size_t bytes_per_iteration = batch_size * (sizeof(uint8_t) + sizeof(float));
  state.counters["bytes"] =
    benchmark::Counter(uint64_t(state.iterations()) * bytes_per_iteration, benchmark::Counter::kIsRate);

  interpreter.reset();
}
#endif  // BENCHMARK_TENSORFLOW_LITE

BENCHMARK(xnnpack_convert_f16_f32)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
  ->UseRealTime();
BENCHMARK(xnnpack_convert_f32_f16)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint16_t>)
  ->UseRealTime();
BENCHMARK(xnnpack_convert_f32_qs8)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
  ->UseRealTime();
BENCHMARK(xnnpack_convert_f32_qu8)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
  ->UseRealTime();
BENCHMARK(xnnpack_convert_qs8_f32)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
  ->UseRealTime();
BENCHMARK(xnnpack_convert_qu8_f32)
  ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
  ->UseRealTime();

#ifdef BENCHMARK_TENSORFLOW_LITE
  BENCHMARK(tflite_convert_f16_f32)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint16_t, float>)
    ->UseRealTime();
  BENCHMARK(tflite_convert_f32_qs8)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, int8_t>)
    ->UseRealTime();
  BENCHMARK(tflite_convert_f32_qu8)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<float, uint8_t>)
    ->UseRealTime();
  BENCHMARK(tflite_convert_qs8_f32)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, float>)
    ->UseRealTime();
  BENCHMARK(tflite_convert_qu8_f32)
    ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, float>)
    ->UseRealTime();
#endif  // BENCHMARK_TENSORFLOW_LITE

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif