blob: d2db0b057b11d7d6a4c100971797e0d03e3fac47 [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/requantization-stubs.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070023
// Returns x / q rounded up to the next integer (ceiling division).
//
// The quotient and the remainder test are computed separately, so the result
// cannot overflow the way (x + q - 1) / q would for large x.
// Precondition: q != 0 (division by zero is undefined behavior).
inline uint32_t divideRoundUp(uint32_t x, uint32_t q)
{
  return x / q + uint32_t(x % q != 0);
}
28
29inline uint32_t roundUp(uint32_t x, uint32_t q)
30{
31 return q * divideRoundUp(x, q);
32}
33
// Returns the smaller of two unsigned 32-bit values (b when they are equal,
// which is indistinguishable from a for plain integers).
inline uint32_t min(uint32_t a, uint32_t b)
{
  return a < b ? a : b;
}
38
39class Requantization : public benchmark::Fixture {
40 public:
41 inline Requantization()
42 {
43 cpuinfo_initialize();
44 const size_t l1d_size = cpuinfo_get_l1d_cache(0)->size;
45 const size_t l1d_reserve = 1024;
46 n_ = (l1d_size - l1d_reserve) / (sizeof(int32_t) + sizeof(uint8_t));
47 n_ = n_ / 16 * 16;
48 }
49
50 virtual void SetUp(const benchmark::State&) override
51 {
52 std::random_device random_device;
53 auto rng = std::mt19937(random_device());
54 auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(), rng);
55
56 input_.resize(n());
57 std::generate(input_.begin(), input_.end(), std::ref(s32rng));
58 output_.resize(n());
59 std::fill(output_.begin(), output_.end(), 0xA5);
60 }
61
62 virtual void TearDown(benchmark::State& state) override
63 {
64 state.SetItemsProcessed(uint64_t(state.iterations()) * n());
65 state.SetBytesProcessed(uint64_t(state.iterations()) * n() * (sizeof(int32_t) + sizeof(uint8_t)));
66 input_.clear();
67 output_.clear();
68 }
69
70 inline const int32_t* input() const
71 {
72 return input_.data();
73 }
74
75 inline uint8_t* output()
76 {
77 return output_.data();
78 }
79
80 inline size_t n() const
81 {
82 return n_;
83 }
84
85 protected:
86 std::vector<int32_t, AlignedAllocator<int32_t, 32>> input_;
87 std::vector<uint8_t> output_;
88 size_t n_;
89};
90
91BENCHMARK_F(Requantization, precise__scalar_unsigned32)(benchmark::State& state)
92{
93 for (auto _ : state) {
94 xnn_requantize_precise__scalar_unsigned32(
95 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
96 }
97}
98
99BENCHMARK_F(Requantization, precise__scalar_unsigned64)(benchmark::State& state)
100{
101 for (auto _ : state) {
102 xnn_requantize_precise__scalar_unsigned64(
103 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
104 }
105}
106
107BENCHMARK_F(Requantization, precise__scalar_signed64)(benchmark::State& state)
108{
109 for (auto _ : state) {
110 xnn_requantize_precise__scalar_signed64(
111 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
112 }
113}
114
115BENCHMARK_F(Requantization, fp32__scalar_lrintf)(benchmark::State& state)
116{
117 for (auto _ : state) {
118 xnn_requantize_fp32__scalar_lrintf(
119 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
120 }
121}
122
123BENCHMARK_F(Requantization, fp32__scalar_magic)(benchmark::State& state)
124{
125 for (auto _ : state) {
126 xnn_requantize_fp32__scalar_magic(
127 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
128 }
129}
130
131BENCHMARK_F(Requantization, gemmlowp__scalar)(benchmark::State& state)
132{
133 for (auto _ : state) {
134 xnn_requantize_gemmlowp__scalar(
135 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
136 }
137}
138
139BENCHMARK_F(Requantization, precise__psimd)(benchmark::State& state)
140{
141 for (auto _ : state) {
142 xnn_requantize_precise__psimd(
143 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
144 }
145}
146
147BENCHMARK_F(Requantization, fp32__psimd)(benchmark::State& state)
148{
149 for (auto _ : state) {
150 xnn_requantize_fp32__psimd(
151 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
152 }
153}
154
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -0700156BENCHMARK_F(Requantization, precise__neon)(benchmark::State& state)
157{
158 for (auto _ : state) {
159 xnn_requantize_precise__neon(
160 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
161 }
162}
163
164BENCHMARK_F(Requantization, fp32__neon)(benchmark::State& state)
165{
166 for (auto _ : state) {
167 xnn_requantize_fp32__neon(
168 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
169 }
170}
171
172BENCHMARK_F(Requantization, q31__neon)(benchmark::State& state)
173{
174 for (auto _ : state) {
175 xnn_requantize_q31__neon(
176 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
177 }
178}
179
180BENCHMARK_F(Requantization, gemmlowp__neon)(benchmark::State& state)
181{
182 for (auto _ : state) {
183 xnn_requantize_gemmlowp__neon(
184 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
185 }
186}
#endif
188
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -0700190BENCHMARK_F(Requantization, precise__sse2)(benchmark::State& state)
191{
192 for (auto _ : state) {
193 xnn_requantize_precise__sse2(
194 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
195 }
196}
197
198BENCHMARK_F(Requantization, precise__ssse3)(benchmark::State& state)
199{
200 for (auto _ : state) {
201 xnn_requantize_precise__ssse3(
202 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
203 }
204}
205
206BENCHMARK_F(Requantization, precise__sse4)(benchmark::State& state)
207{
208 for (auto _ : state) {
209 xnn_requantize_precise__sse4(
210 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
211 }
212}
213
214BENCHMARK_F(Requantization, fp32__sse2)(benchmark::State& state)
215{
216 for (auto _ : state) {
217 xnn_requantize_fp32__sse2(
218 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
219 }
220}
221
222BENCHMARK_F(Requantization, q31__sse2)(benchmark::State& state)
223{
224 for (auto _ : state) {
225 xnn_requantize_q31__sse2(
226 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
227 }
228}
229
230BENCHMARK_F(Requantization, q31__ssse3)(benchmark::State& state)
231{
232 for (auto _ : state) {
233 xnn_requantize_q31__ssse3(
234 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
235 }
236}
237
238BENCHMARK_F(Requantization, q31__sse4)(benchmark::State& state)
239{
240 for (auto _ : state) {
241 xnn_requantize_q31__sse4(
242 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
243 }
244}
245
246BENCHMARK_F(Requantization, gemmlowp__sse2)(benchmark::State& state)
247{
248 for (auto _ : state) {
249 xnn_requantize_gemmlowp__sse2(
250 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
251 }
252}
253
254BENCHMARK_F(Requantization, gemmlowp__ssse3)(benchmark::State& state)
255{
256 for (auto _ : state) {
257 xnn_requantize_gemmlowp__ssse3(
258 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
259 }
260}
261
262BENCHMARK_F(Requantization, gemmlowp__sse4)(benchmark::State& state)
263{
264 for (auto _ : state) {
265 xnn_requantize_gemmlowp__sse4(
266 n(), input(), 0x1.0p-12f /* scale */, 128 /* zero point */, 1 /* qmin */, 254 /* qmax */, output());
267 }
268}
#endif
270
271#ifndef XNNPACK_BENCHMARK_NO_MAIN
272BENCHMARK_MAIN();
273#endif