// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <chrono>
#include <cmath>
#include <functional>
#include <limits>
#include <mutex>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>
#ifdef BENCHMARK_GEMMLOWP
#include "gemmlowp/public/gemmlowp.h"
#endif // BENCHMARK_GEMMLOWP
#ifdef BENCHMARK_RUY
#include "ruy/ruy.h"
#endif // BENCHMARK_RUY
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>


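// Benchmarks one QU8 GEMM microkernel over the matrix shapes supplied by BENCHMARK_GEMM.
// mr/nr are the rows/columns of the output tile the microkernel computes per call; kr
// (and sr, a sub-row interleaving factor) describe how elements along the K dimension
// are grouped, and must match the layout produced by xnn_pack_qu8_gemm_goi_w below.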
static void GEMMBenchmark(benchmark::State& state,
  xnn_qu8_gemm_minmax_ukernel_function gemm,
  xnn_init_qu8_conv_minmax_params_fn init_params,
  size_t mr, size_t nr, size_t kr, size_t sr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (isa_check && !isa_check(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

  std::vector<uint8_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint8_t));
  std::generate(a.begin(), a.end(), std::ref(u8rng));
  std::vector<uint8_t> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(u8rng));
  std::vector<int32_t> b(nc);
  std::generate(b.begin(), b.end(), std::ref(i32rng));

  // Packed weights: kc_stride x nc_stride weight bytes plus one int32 bias per (rounded-up) output channel.
  const size_t w_elements = kc_stride * nc_stride + nc_stride * sizeof(int32_t) / sizeof(uint8_t);
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint8_t) * (w_elements + c_elements));

  std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0);
  const xnn_qu8_packing_params packing_params = { 127, 127 };
  xnn_pack_qu8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
  std::vector<uint8_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), 0xA5);

  union xnn_qu8_conv_minmax_params quantization_params;
  // Kernel zero point 127, requantization scale 0.75, output zero point 127, output range [1, 254].
  init_params(&quantization_params, 127, 0.75f, 127, 1, 254);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      for (uint32_t n = 0; n < nc; n += nr) {
        const uint32_t nb = std::min(nc - n, nr);
        gemm(
          mb, nb, kc * sizeof(uint8_t),
          a.data() + m * kc, kc * sizeof(uint8_t),
          w.data() + (w_elements * buffer_index + n * (kc_stride + sizeof(int32_t))) / sizeof(uint8_t),
          c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint8_t), nr * sizeof(uint8_t),
          &quantization_params);
      }
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // Each multiply-accumulate counts as 2 ops.
  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

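// Each architecture-specific block below wraps one microkernel in a function of this
// shape and registers it with BENCHMARK_GEMM (bench/gemm.h), which runs the wrapper
// over a set of representative network GEMM shapes; the `net` argument receives the
// network name. A hypothetical registration, for illustration only:
//
//   static void qu8_gemm_4x8__example(benchmark::State& state, const char* net) {
//     GEMMBenchmark(state,
//       xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
//       xnn_init_qu8_conv_minmax_rndnu_neon_params,
//       4, 8, 1, 1, benchmark::utils::CheckNEON);
//   }
//   BENCHMARK_GEMM(qu8_gemm_4x8__example)
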
#ifdef BENCHMARK_GEMMLOWP
struct GemmlowpOutputPipeline {
  typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap;
  typedef std::tuple<
    gemmlowp::OutputStageBiasAddition<ColVectorMap>,
    gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
    gemmlowp::OutputStageClamp,
    gemmlowp::OutputStageSaturatingCastToUint8>
    Pipeline;

  static Pipeline Make(
    const int32_t* bias_data,
    int output_rows,
    int32_t output_offset,
    int32_t output_multiplier,
    int output_shift,
    int32_t output_activation_min,
    int32_t output_activation_max)
  {
    ColVectorMap bias_vector(bias_data, output_rows);
    gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
    bias_addition_stage.bias_vector = bias_vector;
    gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint quantize_down_stage;
    quantize_down_stage.result_offset_after_shift = output_offset;
    quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
    quantize_down_stage.result_shift = output_shift;
    gemmlowp::OutputStageClamp clamp_stage;
    clamp_stage.min = output_activation_min;
    clamp_stage.max = output_activation_max;
    gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
    return std::make_tuple(bias_addition_stage, quantize_down_stage, clamp_stage, saturating_cast_stage);
  }
};

static void GemmlowpBenchmark(benchmark::State& state, uint32_t threads)
{
  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

  std::vector<uint8_t> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(u8rng));

  const size_t kElements = nc * kc;
  const size_t bElements = nc;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      kElements * sizeof(uint8_t) + bElements * sizeof(int32_t) + c_elements * sizeof(uint8_t));

  std::vector<uint8_t> k(kElements * num_buffers);
  std::generate(k.begin(), k.end(), std::ref(u8rng));
  std::vector<int32_t> b(bElements * num_buffers);
  std::generate(b.begin(), b.end(), std::ref(i32rng));
  std::vector<uint8_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), 0xA5);

  gemmlowp::MultiThreadGemmContext threadingContext;
  threadingContext.set_max_num_threads(threads);

  size_t buffer_index = 0;
  for (auto _ : state) {
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> AM(a.data(), mc, kc, kc);
    gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> BM(k.data() + buffer_index * kElements, kc, nc, kc);
    gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::RowMajor> CM(c.data() + buffer_index * c_elements, mc, nc, nc);
    const auto& outputPipeline = GemmlowpOutputPipeline::Make(b.data() + buffer_index * bElements, nc, 127, 127, 127, 0, 255);
    gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
      &threadingContext, AM, BM, &CM, 127, 127, outputPipeline);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void gemmlowp_st(benchmark::State& state, const char* net)
{
  GemmlowpBenchmark(state, 1);
}
#endif // BENCHMARK_GEMMLOWP


#ifdef BENCHMARK_RUY
static void RuyBenchmark(benchmark::State& state, size_t threads)
{
  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      nc * (sizeof(uint8_t) * (mc + kc) + sizeof(int32_t)));

  std::vector<uint8_t> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(u8rng));
  std::vector<uint8_t> k(num_buffers * nc * kc);
  std::generate(k.begin(), k.end(), std::ref(u8rng));
  std::vector<int32_t> b(num_buffers * nc);
  std::generate(b.begin(), b.end(), std::ref(i32rng));
  std::vector<uint8_t> c(num_buffers * nc * mc);
  std::fill(c.begin(), c.end(), 0xA5);

  // Note: context must be static to avoid the cost of re-creating it for each benchmark.
  static ruy::Context context;
  context.set_max_num_threads(threads);

  // Ruy computes the transposed product here: A = weights (nc x kc, row-major),
  // B = activations (kc x mc, column-major), C = output (nc x mc, column-major).
  ruy::Matrix<uint8_t> ruy_a;
  ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
  ruy_a.set_zero_point(127);
  ruy::Matrix<uint8_t> ruy_b;
  ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
  ruy_b.set_data(a.data());
  ruy_b.set_zero_point(127);
  ruy::Matrix<uint8_t> ruy_c;
  ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());
  ruy_c.set_zero_point(127);

  ruy::MulParams<int32_t, uint8_t> mul_params;
  mul_params.set_multiplier_fixedpoint(0x40000000);

  // ruy::Context uses deferred initialization, which affects perceived GEMM performance. Initialization happens during
  // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
  // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
  // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
  static std::once_flag warmup;
  std::call_once(warmup, [&](){
    auto start = std::chrono::steady_clock::now();
    do {
      ruy_a.set_data(k.data());
      ruy_c.set_data(c.data());
      mul_params.set_bias(b.data());

      ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
    } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
  });

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - K is not in cache (for any cache level)
    // - B is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    ruy_a.set_data(k.data() + buffer_index * nc * kc);
    ruy_c.set_data(c.data() + buffer_index * mc * nc);
    mul_params.set_bias(b.data() + buffer_index * nc);

    ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}
#endif // BENCHMARK_RUY


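// Microkernel names encode the output tile and requantization scheme: e.g.
// xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot computes a 4x16 tile with kr = 4
// ("c4"), using fixed-point rndnu requantization, while fp32 kernels requantize via
// floating point. (Orientation summary; see src/xnnpack/gemm.h for the authoritative list.)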
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  static void qu8_gemm_4x8c4__aarch64_neondot_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld64,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 4, 1,
      benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1,
      benchmark::utils::CheckNEON);
  }
  BENCHMARK_GEMM(qu8_gemm_4x8c4__aarch64_neondot_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75)
  BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75)
#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  static void qu8_gemm_1x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_2x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_3x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_5x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      5, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_6x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      6, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_8x8c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      8, 8, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_1x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_2x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      2, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_3x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      3, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_4x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_5x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      5, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_6x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      6, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_8x16c4__neondot(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      8, 16, 4, 1, benchmark::utils::CheckNEONDOT);
  }
  static void qu8_gemm_1x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_1x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      1, 16, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x8__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 8, 1, 1, benchmark::utils::CheckNEON);
  }
  static void qu8_gemm_4x16__neon_mlal_lane(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
      xnn_init_qu8_conv_minmax_rndnu_neon_params,
      4, 16, 1, 1, benchmark::utils::CheckNEON);
  }

  BENCHMARK_GEMM(qu8_gemm_1x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_2x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_3x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_4x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_5x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_6x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_8x8c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_1x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_2x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_3x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_4x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_5x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_6x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_8x16c4__neondot)
  BENCHMARK_GEMM(qu8_gemm_1x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_1x16__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_4x8__neon_mlal_lane)
  BENCHMARK_GEMM(qu8_gemm_4x16__neon_mlal_lane)
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  static void qu8_gemm_1x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      1, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_2x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      2, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_3x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      3, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_4x16c8__avx512skx(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
      xnn_init_qu8_conv_minmax_fp32_avx512_params,
      4, 16, 8, 1,
      benchmark::utils::CheckAVX512SKX);
  }
  static void qu8_gemm_1x8c8__avx2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
      xnn_init_qu8_conv_minmax_fp32_avx2_params,
      1, 8, 8, 1,
      benchmark::utils::CheckAVX2);
  }
  static void qu8_gemm_2x8c8__avx2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x8c8__avx2,
      xnn_init_qu8_conv_minmax_fp32_avx2_params,
      2, 8, 8, 1,
      benchmark::utils::CheckAVX2);
  }
  static void qu8_gemm_3x8c8__avx2(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2,
      xnn_init_qu8_conv_minmax_fp32_avx2_params,
      3, 8, 8, 1,
      benchmark::utils::CheckAVX2);
  }
  static void qu8_gemm_1x4c2__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_2x4c2__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_3x4c2__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_4x4c2__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_1x4c2__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_2x4c2__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_3x4c2__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_4x4c2__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_1x4c8__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_2x4c8__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_3x4c8__xop_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_1x4c8__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_2x4c8__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_3x4c8__xop_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckXOP);
  }
  static void qu8_gemm_1x4c2__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_2x4c2__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_3x4c2__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_4x4c2__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_1x4c2__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_2x4c2__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_3x4c2__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_4x4c2__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_1x4c8__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_2x4c8__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_3x4c8__avx_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_1x4c8__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_2x4c8__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_3x4c8__avx_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckAVX);
  }
  static void qu8_gemm_1x4c2__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_2x4c2__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_3x4c2__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_4x4c2__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_1x4c2__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_2x4c2__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_3x4c2__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_4x4c2__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_1x4c8__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_2x4c8__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_3x4c8__sse41_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_1x4c8__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_2x4c8__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_3x4c8__sse41_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1,
      benchmark::utils::CheckSSE41);
  }
  static void qu8_gemm_1x4c2__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1);
  }
  static void qu8_gemm_2x4c2__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1);
  }
  static void qu8_gemm_3x4c2__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1);
  }
  static void qu8_gemm_4x4c2__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1);
  }
  static void qu8_gemm_1x4c2__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 2, 1);
  }
  static void qu8_gemm_2x4c2__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 2, 1);
  }
  static void qu8_gemm_3x4c2__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 2, 1);
  }
  static void qu8_gemm_4x4c2__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      4, 4, 2, 1);
  }
  static void qu8_gemm_1x4c8__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1);
  }
  static void qu8_gemm_2x4c8__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1);
  }
  static void qu8_gemm_3x4c8__sse2_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1);
  }
  static void qu8_gemm_1x4c8__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      1, 4, 8, 1);
  }
  static void qu8_gemm_2x4c8__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      2, 4, 8, 1);
  }
  static void qu8_gemm_3x4c8__sse2_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
      xnn_init_qu8_conv_minmax_fp32_sse2_params,
      3, 4, 8, 1);
  }

  BENCHMARK_GEMM(qu8_gemm_1x16c8__avx512skx)
  BENCHMARK_GEMM(qu8_gemm_2x16c8__avx512skx)
  BENCHMARK_GEMM(qu8_gemm_3x16c8__avx512skx)
  BENCHMARK_GEMM(qu8_gemm_4x16c8__avx512skx)

  BENCHMARK_GEMM(qu8_gemm_1x8c8__avx2)
  BENCHMARK_GEMM(qu8_gemm_2x8c8__avx2)
  BENCHMARK_GEMM(qu8_gemm_3x8c8__avx2)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__xop_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__xop_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__xop_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__avx_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__avx_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__avx_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse41_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse41_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse41_ld128)

  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_4x4c2__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse2_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__sse2_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__sse2_ld128)
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_WASMSIMD
  static void qu8_gemm_1x4c8__wasmsimd_mul32_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      1, 4, 8, 1);
  }
  static void qu8_gemm_2x4c8__wasmsimd_mul32_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      2, 4, 8, 1);
  }
  static void qu8_gemm_3x4c8__wasmsimd_mul32_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      3, 4, 8, 1);
  }
  static void qu8_gemm_1x4c8__wasmsimd_mul32_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      1, 4, 8, 1);
  }
  static void qu8_gemm_2x4c8__wasmsimd_mul32_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      2, 4, 8, 1);
  }
  static void qu8_gemm_3x4c8__wasmsimd_mul32_ld128(benchmark::State& state, const char* net) {
    GEMMBenchmark(state,
      xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128,
      xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
      3, 4, 8, 1);
  }

  BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_mul32_ld64)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_mul32_ld64)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_mul32_ld64)
  BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_mul32_ld128)
  BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_mul32_ld128)
  BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_mul32_ld128)
#endif // XNN_ARCH_WASMSIMD


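// Portable scalar kernels: the lrint variants round the fp32-requantized result with an
// lrintf-style float-to-int conversion, while the magic variants use the magic-constant
// float-to-int trick. (Orientation comment; the kernels themselves are authoritative.)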
static void qu8_gemm_1x2__scalar_lrint(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrint,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrint_params,
    1, 2, 1, 1);
}
static void qu8_gemm_2x2__scalar_lrint(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_lrint,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrint_params,
    2, 2, 1, 1);
}
static void qu8_gemm_3x2__scalar_lrint(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrint,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrint_params,
    3, 2, 1, 1);
}
static void qu8_gemm_4x2__scalar_lrint(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrint,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrint_params,
    4, 2, 1, 1);
}
static void qu8_gemm_1x4__scalar_lrint(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrint,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrint_params,
    1, 4, 1, 1);
}
static void qu8_gemm_2x4__scalar_lrint(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_lrint,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrint_params,
    2, 4, 1, 1);
}
static void qu8_gemm_3x4__scalar_lrint(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrint,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrint_params,
    3, 4, 1, 1);
}
static void qu8_gemm_4x4__scalar_lrint(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrint,
    xnn_init_qu8_conv_minmax_fp32_scalar_lrint_params,
    4, 4, 1, 1);
}
static void qu8_gemm_1x2__scalar_magic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_magic,
    xnn_init_qu8_conv_minmax_fp32_scalar_magic_params,
    1, 2, 1, 1);
}
static void qu8_gemm_2x2__scalar_magic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_magic,
    xnn_init_qu8_conv_minmax_fp32_scalar_magic_params,
    2, 2, 1, 1);
}
static void qu8_gemm_3x2__scalar_magic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_magic,
    xnn_init_qu8_conv_minmax_fp32_scalar_magic_params,
    3, 2, 1, 1);
}
static void qu8_gemm_4x2__scalar_magic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_magic,
    xnn_init_qu8_conv_minmax_fp32_scalar_magic_params,
    4, 2, 1, 1);
}
static void qu8_gemm_1x4__scalar_magic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_magic,
    xnn_init_qu8_conv_minmax_fp32_scalar_magic_params,
    1, 4, 1, 1);
}
static void qu8_gemm_2x4__scalar_magic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_magic,
    xnn_init_qu8_conv_minmax_fp32_scalar_magic_params,
    2, 4, 1, 1);
}
static void qu8_gemm_3x4__scalar_magic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_magic,
    xnn_init_qu8_conv_minmax_fp32_scalar_magic_params,
    3, 4, 1, 1);
}
static void qu8_gemm_4x4__scalar_magic(benchmark::State& state, const char* net) {
  GEMMBenchmark(state,
    xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_magic,
    xnn_init_qu8_conv_minmax_fp32_scalar_magic_params,
    4, 4, 1, 1);
}

BENCHMARK_GEMM(qu8_gemm_1x2__scalar_lrint)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_lrint)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_lrint)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_lrint)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_lrint)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_lrint)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_lrint)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_lrint)

BENCHMARK_GEMM(qu8_gemm_1x2__scalar_magic)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_magic)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_magic)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_magic)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_magic)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_magic)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_magic)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_magic)


#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif // BENCHMARK_RUY
#ifdef BENCHMARK_GEMMLOWP
BENCHMARK_GEMM(gemmlowp_st)
#endif // BENCHMARK_GEMMLOWP

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif