blob: 2a25313eb7123f3005bc60a168a91fc0e0746505 [file] [log] [blame]
XNNPACK Teamb455b122019-09-27 18:10:33 -07001// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
9#include <algorithm>
10#include <cfloat>
11#include <chrono>
12#include <cmath>
13#include <functional>
Marat Dukhan5ce30d92020-04-14 03:31:26 -070014#include <limits>
XNNPACK Teamb455b122019-09-27 18:10:33 -070015#include <mutex>
16#include <random>
17#include <vector>
18
Frank Barchardfa4daf02021-10-27 13:31:04 -070019#include <cpuinfo.h>
20
XNNPACK Teamb455b122019-09-27 18:10:33 -070021#include <benchmark/benchmark.h>
Marat Dukhan33f0c7a2019-10-01 13:33:08 -070022#ifdef BENCHMARK_GEMMLOWP
23#include "gemmlowp/public/gemmlowp.h"
24#endif // BENCHMARK_GEMMLOWP
25#ifdef BENCHMARK_RUY
Benoit Jacobb038fdc2020-03-25 12:14:20 -070026#include "ruy/ruy.h"
Marat Dukhan33f0c7a2019-10-01 13:33:08 -070027#endif // BENCHMARK_RUY
Frank Barchardbb4c18b2019-09-30 11:05:52 -070028#include "bench/gemm.h"
29#include "bench/utils.h"
30#include <xnnpack/AlignedAllocator.h>
Marat Dukhan1dadbf72019-10-01 10:46:20 -070031#include <xnnpack/common.h>
Frank Barchardbb4c18b2019-09-30 11:05:52 -070032#include <xnnpack/gemm.h>
33#include <xnnpack/pack.h>
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -070034#include <xnnpack/params-init.h>
Frank Barchardbb4c18b2019-09-30 11:05:52 -070035#include <xnnpack/params.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070036
37
// Benchmarks a single QU8 GEMM micro-kernel (`gemm`) over the problem size
// (mc x nc x kc) supplied via the benchmark state ranges.
//
// Parameters:
//   gemm        - the micro-kernel under test.
//   init_params - initializer for the kernel's quantization/minmax params.
//   mr, nr      - the micro-kernel's row/column tile sizes.
//   kr, sr      - the micro-kernel's K-dimension packing factors.
//   isa_check   - optional predicate that skips the benchmark when the CPU
//                 lacks the required ISA extension (e.g. NEONDOT).
static void GEMMBenchmark(benchmark::State& state,
  xnn_qu8_gemm_minmax_ukernel_function gemm,
  xnn_init_qu8_conv_minmax_params_fn init_params,
  size_t mr, size_t nr, size_t kr, size_t sr,
  benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (isa_check && !isa_check(state)) {
    // isa_check already called state.SkipWithError when unsupported.
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  // Dimensions rounded up to the kernel's tile/packing granularity.
  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr * sr);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
  auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

  // A is over-allocated by XNN_EXTRA_BYTES because micro-kernels may read
  // (but not use) a few bytes past the logical end of the input row.
  std::vector<uint8_t> a(mc * kc + XNN_EXTRA_BYTES / sizeof(uint8_t));
  std::generate(a.begin(), a.end(), std::ref(u8rng));
  std::vector<uint8_t> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(u8rng));
  std::vector<int32_t> b(nc);
  std::generate(b.begin(), b.end(), std::ref(i32rng));

  // Packed weights: kc_stride x nc_stride bytes of weights plus one int32
  // bias per (rounded-up) output channel, measured in uint8 elements.
  const size_t w_elements = kc_stride * nc_stride + nc_stride * sizeof(int32_t) / sizeof(uint8_t);
  const size_t c_elements = mc * nc;
  // Enough W/C buffer replicas to exceed the last-level cache, so each
  // iteration touches cold weights/output (see comment in the loop below).
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint8_t) * (w_elements + c_elements));

  std::vector<uint8_t, AlignedAllocator<uint8_t, 64>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0);
  // Zero points for input and kernel (both 127, mid-range for uint8).
  const xnn_qu8_packing_params packing_params = { 127, 127 };
  xnn_pack_qu8_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), 0, &packing_params);
  std::vector<uint8_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), 0xA5);

  union xnn_qu8_conv_minmax_params quantization_params;
  // kernel zero point 127, scale 0.75, output zero point 127, range [1, 254].
  init_params(&quantization_params, 127, 0.75f, 127, 1, 254);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
    // - W is not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    // Tile the full problem into mr x nr micro-kernel calls.
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = min(mc - m, mr);  // rows in this (possibly partial) tile
      for (uint32_t n = 0; n < nc; n += nr) {
        const uint32_t nb = min(nc - n, nr);  // columns in this (possibly partial) tile
        gemm(
          mb, nb, kc * sizeof(uint8_t),
          a.data() + m * kc, kc * sizeof(uint8_t),
          // Offset into the current packed-weights replica: each group of nr
          // channels carries kc_stride weights + an int32 bias per channel.
          w.data() + (w_elements * buffer_index + n * (kc_stride + sizeof(int32_t))) / sizeof(uint8_t),
          c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint8_t), nr * sizeof(uint8_t),
          &quantization_params);
      }
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

  // 2 ops (multiply + add) per MAC.
  state.counters["OPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}
120
Marat Dukhan33f0c7a2019-10-01 13:33:08 -0700121#ifdef BENCHMARK_GEMMLOWP
XNNPACK Teamb455b122019-09-27 18:10:33 -0700122struct GemmlowpOutputPipeline {
123 typedef gemmlowp::VectorMap<const int32_t, gemmlowp::VectorShape::Col> ColVectorMap;
124 typedef std::tuple<
125 gemmlowp::OutputStageBiasAddition<ColVectorMap>,
126 gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint,
127 gemmlowp::OutputStageClamp,
128 gemmlowp::OutputStageSaturatingCastToUint8>
129 Pipeline;
130
131 static Pipeline Make(
132 const int32_t* bias_data,
133 int output_rows,
134 int32_t output_offset,
135 int32_t output_multiplier,
136 int output_shift,
137 int32_t output_activation_min,
138 int32_t output_activation_max)
139 {
140 ColVectorMap bias_vector(bias_data, output_rows);
141 gemmlowp::OutputStageBiasAddition<ColVectorMap> bias_addition_stage;
142 bias_addition_stage.bias_vector = bias_vector;
143 gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint quantize_down_stage;
144 quantize_down_stage.result_offset_after_shift = output_offset;
145 quantize_down_stage.result_fixedpoint_multiplier = output_multiplier;
146 quantize_down_stage.result_shift = output_shift;
147 gemmlowp::OutputStageClamp clamp_stage;
148 clamp_stage.min = output_activation_min;
149 clamp_stage.max = output_activation_max;
150 gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage;
151 return std::make_tuple(bias_addition_stage, quantize_down_stage, clamp_stage, saturating_cast_stage);
152 }
153};
154
155static void GemmlowpBenchmark(benchmark::State& state, uint32_t threads)
156{
XNNPACK Teamb455b122019-09-27 18:10:33 -0700157 const size_t mc = state.range(0);
158 const size_t nc = state.range(1);
159 const size_t kc = state.range(2);
160
161 std::random_device random_device;
162 auto rng = std::mt19937(random_device());
Marat Dukhanecd83112020-08-03 21:50:28 -0700163 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
Marat Dukhan44f0ca72020-08-02 21:46:58 -0700164 auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700165
166 std::vector<uint8_t> a(mc * kc);
167 std::generate(a.begin(), a.end(), std::ref(u8rng));
168
169 const size_t kElements = nc * kc;
170 const size_t bElements = nc;
171 const size_t c_elements = mc * nc;
172 const size_t num_buffers = 1 +
Marat Dukhan42323232019-10-23 02:09:02 -0700173 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Teamb455b122019-09-27 18:10:33 -0700174 kElements * sizeof(uint8_t) + bElements * sizeof(int32_t) + c_elements * sizeof(uint8_t));
175
176 std::vector<uint8_t> k(kElements * num_buffers);
177 std::generate(k.begin(), k.end(), std::ref(u8rng));
178 std::vector<int32_t> b(bElements * num_buffers);
Marat Dukhanecd83112020-08-03 21:50:28 -0700179 std::generate(b.begin(), b.end(), std::ref(i32rng));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700180 std::vector<uint8_t> c(c_elements * num_buffers);
181 std::fill(c.begin(), c.end(), 0xA5);
182
183 gemmlowp::MultiThreadGemmContext threadingContext;
184 threadingContext.set_max_num_threads(threads);
185
186 size_t buffer_index = 0;
187 for (auto _ : state) {
188 state.PauseTiming();
Marat Dukhan42323232019-10-23 02:09:02 -0700189 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700190 buffer_index = (buffer_index + 1) % num_buffers;
191 state.ResumeTiming();
192
193 gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::RowMajor> AM(a.data(), mc, kc, kc);
194 gemmlowp::MatrixMap<const uint8_t, gemmlowp::MapOrder::ColMajor> BM(k.data() + buffer_index * kElements, kc, nc, kc);
195 gemmlowp::MatrixMap<uint8_t, gemmlowp::MapOrder::RowMajor> CM(c.data() + buffer_index * c_elements, mc, nc, nc);
196 const auto& outputPipeline = GemmlowpOutputPipeline::Make(b.data() + buffer_index * bElements, nc, 127, 127, 127, 0, 255);
197 gemmlowp::GemmWithOutputPipeline<uint8_t, uint8_t, gemmlowp::L8R8WithLhsNonzeroBitDepthParams>(
198 &threadingContext, AM, BM, &CM, 127, 127, outputPipeline);
199 }
200
Marat Dukhand713e8a2020-12-04 14:23:12 -0800201 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
202 if (cpu_frequency != 0) {
203 state.counters["cpufreq"] = cpu_frequency;
204 }
205
XNNPACK Teamb455b122019-09-27 18:10:33 -0700206 state.counters["OPS"] = benchmark::Counter(
207 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
208}
209
// Single-threaded gemmlowp baseline: forwards to GemmlowpBenchmark with one thread.
static void gemmlowp_st(benchmark::State& state, const char* net)
{
  GemmlowpBenchmark(state, 1);
}
Marat Dukhan33f0c7a2019-10-01 13:33:08 -0700214#endif // BENCHMARK_GEMMLOWP
XNNPACK Teamb455b122019-09-27 18:10:33 -0700215
Marat Dukhan33f0c7a2019-10-01 13:33:08 -0700216
217#ifdef BENCHMARK_RUY
XNNPACK Teamb455b122019-09-27 18:10:33 -0700218static void RuyBenchmark(benchmark::State& state, size_t threads)
219{
220 const size_t mc = state.range(0);
221 const size_t nc = state.range(1);
222 const size_t kc = state.range(2);
223
224 std::random_device random_device;
225 auto rng = std::mt19937(random_device());
Marat Dukhanecd83112020-08-03 21:50:28 -0700226 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
Marat Dukhan44f0ca72020-08-02 21:46:58 -0700227 auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700228
229 const size_t num_buffers = 1 +
Marat Dukhan42323232019-10-23 02:09:02 -0700230 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
XNNPACK Teamb455b122019-09-27 18:10:33 -0700231 nc * (sizeof(uint8_t) * (mc + kc) + sizeof(int32_t)));
232
233 std::vector<uint8_t> a(mc * kc);
234 std::generate(a.begin(), a.end(), std::ref(u8rng));
235 std::vector<uint8_t> k(num_buffers * nc * kc);
236 std::generate(k.begin(), k.end(), std::ref(u8rng));
237 std::vector<int32_t> b(num_buffers * nc);
Marat Dukhanecd83112020-08-03 21:50:28 -0700238 std::generate(b.begin(), b.end(), std::ref(i32rng));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700239 std::vector<uint8_t> c(num_buffers * nc * mc);
240 std::fill(c.begin(), c.end(), std::nanf(""));
241
242 // Note: context must be static to avoid the cost of re-creating it for each benchmark.
243 static ruy::Context context;
Benoit Jacob349701a2020-04-15 19:35:24 -0700244 context.set_max_num_threads(threads);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700245
246 ruy::Matrix<uint8_t> ruy_a;
Benoit Jacob349701a2020-04-15 19:35:24 -0700247 ruy::MakeSimpleLayout(nc, kc, ruy::Order::kRowMajor, ruy_a.mutable_layout());
248 ruy_a.set_zero_point(127);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700249 ruy::Matrix<uint8_t> ruy_b;
Benoit Jacob349701a2020-04-15 19:35:24 -0700250 ruy::MakeSimpleLayout(kc, mc, ruy::Order::kColMajor, ruy_b.mutable_layout());
251 ruy_b.set_data(a.data());
252 ruy_b.set_zero_point(127);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700253 ruy::Matrix<uint8_t> ruy_c;
Benoit Jacob349701a2020-04-15 19:35:24 -0700254 ruy::MakeSimpleLayout(nc, mc, ruy::Order::kColMajor, ruy_c.mutable_layout());
255 ruy_c.set_zero_point(127);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700256
Benoit Jacobb026e222020-04-16 12:30:03 -0700257 ruy::MulParams<int32_t, uint8_t> mul_params;
258 mul_params.set_multiplier_fixedpoint(0x40000000);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700259
260 // ruy::Context uses deferred initialization, which affects percieved GEMM performance. Initialization happens during
261 // the first GEMM calls, and per Benoit Jacob it takes up to ~250 milliseconds for performance to stabilize.
262 // Thus, on the first benchmark, we compute GEMM for 500 milliseconds (to be safe) without recording performance, and
263 // keep the ruy::Context object initialized (by being static) between subsequent benchmarks.
264 static std::once_flag warmup;
265 std::call_once(warmup, [&](){
266 auto start = std::chrono::steady_clock::now();
267 do {
Benoit Jacob349701a2020-04-15 19:35:24 -0700268 ruy_a.set_data(k.data());
269 ruy_c.set_data(c.data());
Benoit Jacobb026e222020-04-16 12:30:03 -0700270 mul_params.set_bias(b.data());
XNNPACK Teamb455b122019-09-27 18:10:33 -0700271
Benoit Jacobb026e222020-04-16 12:30:03 -0700272 ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700273 } while (std::chrono::duration<double>(std::chrono::steady_clock::now() - start).count() < 0.5);
274 });
275
276 size_t buffer_index = 0;
277 for (auto _ : state) {
278 // Use circular buffers (exceeding cache size) and prefetch to control cache state:
279 // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
280 // - K is not in cache (for any cache level)
281 // - B is not in cache (for any cache level)
282 // - C is not in cache (for any cache level)
283 state.PauseTiming();
Marat Dukhan42323232019-10-23 02:09:02 -0700284 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint8_t));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700285 buffer_index = (buffer_index + 1) % num_buffers;
286 state.ResumeTiming();
287
Benoit Jacob349701a2020-04-15 19:35:24 -0700288 ruy_a.set_data(k.data() + buffer_index * nc * kc);
289 ruy_c.set_data(c.data() + buffer_index * mc * nc);
Benoit Jacobb026e222020-04-16 12:30:03 -0700290 mul_params.set_bias(b.data() + buffer_index * nc);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700291
Benoit Jacobb026e222020-04-16 12:30:03 -0700292 ruy::Mul(ruy_a, ruy_b, mul_params, &context, &ruy_c);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700293 }
294
Marat Dukhand713e8a2020-12-04 14:23:12 -0800295 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
296 if (cpu_frequency != 0) {
297 state.counters["cpufreq"] = cpu_frequency;
298 }
299
XNNPACK Teamb455b122019-09-27 18:10:33 -0700300 state.counters["OPS"] = benchmark::Counter(
301 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
302}
303
// Single-threaded ruy baseline: forwards to RuyBenchmark with one thread.
static void ruy_st(benchmark::State& state, const char* net)
{
  RuyBenchmark(state, 1);
}
Marat Dukhan33f0c7a2019-10-01 13:33:08 -0700308#endif // BENCHMARK_RUY
XNNPACK Teamb455b122019-09-27 18:10:33 -0700309
310
Frank Barchard901845c2022-01-19 01:45:22 -0800311#if XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
Frank Barchardf82410d2022-02-02 01:01:38 -0800312 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
313 GEMMBenchmark(state,
314 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a53,
315 xnn_init_qu8_conv_minmax_rndnu_neon_params,
316 4, 8, 1, 1, benchmark::utils::CheckNEON);
317 }
318 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
319 GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53,
320 xnn_init_qu8_conv_minmax_rndnu_neon_params,
321 4, 8, 1, 1, benchmark::utils::CheckNEON);
322 }
Frank Barchard34251d82022-02-02 11:57:11 -0800323 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7(benchmark::State& state, const char* net) {
324 GEMMBenchmark(state,
325 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_cortex_a7,
326 xnn_init_qu8_conv_minmax_rndnu_neon_params,
327 4, 8, 1, 1, benchmark::utils::CheckNEON);
328 }
329 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7(benchmark::State& state, const char* net) {
330 GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7,
331 xnn_init_qu8_conv_minmax_rndnu_neon_params,
332 4, 8, 1, 1, benchmark::utils::CheckNEON);
333 }
Frank Barchard901845c2022-01-19 01:45:22 -0800334 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
335 GEMMBenchmark(state,
336 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_ld64,
337 xnn_init_qu8_conv_minmax_rndnu_neon_params,
338 4, 8, 1, 1, benchmark::utils::CheckNEON);
339 }
340 static void qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
341 GEMMBenchmark(state, xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__aarch32_neon_mlal_lane_prfm_ld64,
342 xnn_init_qu8_conv_minmax_rndnu_neon_params,
343 4, 8, 1, 1, benchmark::utils::CheckNEON);
344 }
Frank Barchardf82410d2022-02-02 01:01:38 -0800345 BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a53)
346 BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a53)
Frank Barchard34251d82022-02-02 11:57:11 -0800347 BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_cortex_a7)
348 BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_cortex_a7)
Frank Barchard901845c2022-01-19 01:45:22 -0800349 BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_ld64)
350 BENCHMARK_GEMM(qu8_gemm_4x8__aarch32_neon_mlal_lane_prfm_ld64)
351#endif // XNN_ARCH_ARM && XNN_ENABLE_ASSEMBLY
352
Frank Barchard4c3e5a92021-08-16 19:17:39 -0700353#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchardefc3ccf2021-08-31 23:20:00 -0700354 static void qu8_gemm_4x16c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
Frank Barchardca4c68e2021-08-25 19:06:40 -0700355 GEMMBenchmark(state,
Frank Barchardefc3ccf2021-08-31 23:20:00 -0700356 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_cortex_a55,
Frank Barchardca4c68e2021-08-25 19:06:40 -0700357 xnn_init_qu8_conv_minmax_rndnu_neon_params,
Frank Barchardefc3ccf2021-08-31 23:20:00 -0700358 4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
Frank Barchardca4c68e2021-08-25 19:06:40 -0700359 }
Frank Barchard40668982021-08-24 11:12:04 -0700360 static void qu8_gemm_4x16c4__aarch64_neondot_ld128(benchmark::State& state, const char* net) {
361 GEMMBenchmark(state,
362 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__aarch64_neondot_ld128,
363 xnn_init_qu8_conv_minmax_rndnu_neon_params,
364 4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
365 }
Frank Barcharddf8e6042021-09-03 13:56:29 -0700366 static void qu8_gemm_4x8c4__aarch64_neondot_ld128(benchmark::State& state, const char* net) {
Frank Barchard0049e892021-08-22 09:37:21 -0700367 GEMMBenchmark(state,
Frank Barcharddf8e6042021-09-03 13:56:29 -0700368 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_ld128,
Frank Barchard0049e892021-08-22 09:37:21 -0700369 xnn_init_qu8_conv_minmax_rndnu_neon_params,
370 4, 8, 4, 1,
371 benchmark::utils::CheckNEONDOT);
372 }
Frank Barchardefc3ccf2021-08-31 23:20:00 -0700373 static void qu8_gemm_4x8c4__aarch64_neondot_cortex_a55(benchmark::State& state, const char* net) {
374 GEMMBenchmark(state,
375 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__aarch64_neondot_cortex_a55,
376 xnn_init_qu8_conv_minmax_rndnu_neon_params,
377 4, 8, 4, 1, benchmark::utils::CheckNEONDOT);
378 }
Frank Barchard4c3e5a92021-08-16 19:17:39 -0700379 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53(benchmark::State& state, const char* net) {
380 GEMMBenchmark(state,
381 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a53,
382 xnn_init_qu8_conv_minmax_rndnu_neon_params,
383 4, 16, 1, 1,
384 benchmark::utils::CheckNEON);
385 }
386 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53(benchmark::State& state, const char* net) {
387 GEMMBenchmark(state,
388 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53,
389 xnn_init_qu8_conv_minmax_rndnu_neon_params,
390 4, 16, 1, 1,
391 benchmark::utils::CheckNEON);
392 }
Frank Barchard9cdc10d2021-11-22 19:03:54 -0800393 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64(benchmark::State& state, const char* net) {
394 GEMMBenchmark(state,
395 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_ld64,
396 xnn_init_qu8_conv_minmax_rndnu_neon_params,
397 4, 16, 1, 1,
398 benchmark::utils::CheckNEON);
399 }
400 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64(benchmark::State& state, const char* net) {
401 GEMMBenchmark(state,
402 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_ld64,
403 xnn_init_qu8_conv_minmax_rndnu_neon_params,
404 4, 16, 1, 1,
405 benchmark::utils::CheckNEON);
406 }
Frank Barchard4c3e5a92021-08-16 19:17:39 -0700407 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75(benchmark::State& state, const char* net) {
408 GEMMBenchmark(state,
409 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_cortex_a75,
410 xnn_init_qu8_conv_minmax_rndnu_neon_params,
411 4, 16, 1, 1,
412 benchmark::utils::CheckNEON);
413 }
414 static void qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75(benchmark::State& state, const char* net) {
415 GEMMBenchmark(state,
416 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75,
417 xnn_init_qu8_conv_minmax_rndnu_neon_params,
418 4, 16, 1, 1,
419 benchmark::utils::CheckNEON);
420 }
Frank Barchardefc3ccf2021-08-31 23:20:00 -0700421 BENCHMARK_GEMM(qu8_gemm_4x8c4__aarch64_neondot_cortex_a55)
Frank Barcharddf8e6042021-09-03 13:56:29 -0700422 BENCHMARK_GEMM(qu8_gemm_4x16c4__aarch64_neondot_cortex_a55)
423 BENCHMARK_GEMM(qu8_gemm_4x8c4__aarch64_neondot_ld128)
424 BENCHMARK_GEMM(qu8_gemm_4x16c4__aarch64_neondot_ld128)
Frank Barchard4c3e5a92021-08-16 19:17:39 -0700425 BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a53)
426 BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a53)
Frank Barchard9cdc10d2021-11-22 19:03:54 -0800427 BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_ld64)
428 BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_ld64)
Frank Barchard4c3e5a92021-08-16 19:17:39 -0700429 BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_cortex_a75)
430 BENCHMARK_GEMM(qu8_gemm_4x16__aarch64_neon_mlal_lane_prfm_cortex_a75)
431#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
432
Frank Barchard0049e892021-08-22 09:37:21 -0700433
Marat Dukhand8e2d712021-07-26 23:35:50 -0700434#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard88e839c2021-08-11 00:12:31 -0700435 static void qu8_gemm_1x8c4__neondot(benchmark::State& state, const char* net) {
Frank Barcharde0331262021-08-11 23:18:59 -0700436 GEMMBenchmark(state,
437 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8c4__neondot,
438 xnn_init_qu8_conv_minmax_rndnu_neon_params,
439 1, 8, 4, 1, benchmark::utils::CheckNEONDOT);
440 }
441 static void qu8_gemm_2x8c4__neondot(benchmark::State& state, const char* net) {
442 GEMMBenchmark(state,
443 xnn_qu8_gemm_minmax_rndnu_ukernel_2x8c4__neondot,
444 xnn_init_qu8_conv_minmax_rndnu_neon_params,
445 2, 8, 4, 1, benchmark::utils::CheckNEONDOT);
446 }
447 static void qu8_gemm_3x8c4__neondot(benchmark::State& state, const char* net) {
448 GEMMBenchmark(state,
449 xnn_qu8_gemm_minmax_rndnu_ukernel_3x8c4__neondot,
450 xnn_init_qu8_conv_minmax_rndnu_neon_params,
451 3, 8, 4, 1, benchmark::utils::CheckNEONDOT);
Frank Barchard88e839c2021-08-11 00:12:31 -0700452 }
453 static void qu8_gemm_4x8c4__neondot(benchmark::State& state, const char* net) {
Frank Barcharde0331262021-08-11 23:18:59 -0700454 GEMMBenchmark(state,
455 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8c4__neondot,
456 xnn_init_qu8_conv_minmax_rndnu_neon_params,
457 4, 8, 4, 1, benchmark::utils::CheckNEONDOT);
458 }
459 static void qu8_gemm_5x8c4__neondot(benchmark::State& state, const char* net) {
460 GEMMBenchmark(state,
461 xnn_qu8_gemm_minmax_rndnu_ukernel_5x8c4__neondot,
462 xnn_init_qu8_conv_minmax_rndnu_neon_params,
463 5, 8, 4, 1, benchmark::utils::CheckNEONDOT);
Frank Barchard88e839c2021-08-11 00:12:31 -0700464 }
465 static void qu8_gemm_6x8c4__neondot(benchmark::State& state, const char* net) {
Frank Barcharde0331262021-08-11 23:18:59 -0700466 GEMMBenchmark(state,
467 xnn_qu8_gemm_minmax_rndnu_ukernel_6x8c4__neondot,
468 xnn_init_qu8_conv_minmax_rndnu_neon_params,
469 6, 8, 4, 1, benchmark::utils::CheckNEONDOT);
Frank Barchard88e839c2021-08-11 00:12:31 -0700470 }
471 static void qu8_gemm_8x8c4__neondot(benchmark::State& state, const char* net) {
Frank Barcharde0331262021-08-11 23:18:59 -0700472 GEMMBenchmark(state,
473 xnn_qu8_gemm_minmax_rndnu_ukernel_8x8c4__neondot,
474 xnn_init_qu8_conv_minmax_rndnu_neon_params,
475 8, 8, 4, 1, benchmark::utils::CheckNEONDOT);
Frank Barchard88e839c2021-08-11 00:12:31 -0700476 }
477 static void qu8_gemm_1x16c4__neondot(benchmark::State& state, const char* net) {
Frank Barcharde0331262021-08-11 23:18:59 -0700478 GEMMBenchmark(state,
479 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16c4__neondot,
480 xnn_init_qu8_conv_minmax_rndnu_neon_params,
481 1, 16, 4, 1, benchmark::utils::CheckNEONDOT);
482 }
483 static void qu8_gemm_2x16c4__neondot(benchmark::State& state, const char* net) {
484 GEMMBenchmark(state,
485 xnn_qu8_gemm_minmax_rndnu_ukernel_2x16c4__neondot,
486 xnn_init_qu8_conv_minmax_rndnu_neon_params,
487 2, 16, 4, 1, benchmark::utils::CheckNEONDOT);
488 }
489 static void qu8_gemm_3x16c4__neondot(benchmark::State& state, const char* net) {
490 GEMMBenchmark(state,
491 xnn_qu8_gemm_minmax_rndnu_ukernel_3x16c4__neondot,
492 xnn_init_qu8_conv_minmax_rndnu_neon_params,
493 3, 16, 4, 1, benchmark::utils::CheckNEONDOT);
Frank Barchard88e839c2021-08-11 00:12:31 -0700494 }
495 static void qu8_gemm_4x16c4__neondot(benchmark::State& state, const char* net) {
Frank Barcharde0331262021-08-11 23:18:59 -0700496 GEMMBenchmark(state,
497 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16c4__neondot,
498 xnn_init_qu8_conv_minmax_rndnu_neon_params,
499 4, 16, 4, 1, benchmark::utils::CheckNEONDOT);
500 }
501 static void qu8_gemm_5x16c4__neondot(benchmark::State& state, const char* net) {
502 GEMMBenchmark(state,
503 xnn_qu8_gemm_minmax_rndnu_ukernel_5x16c4__neondot,
504 xnn_init_qu8_conv_minmax_rndnu_neon_params,
505 5, 16, 4, 1, benchmark::utils::CheckNEONDOT);
Frank Barchard88e839c2021-08-11 00:12:31 -0700506 }
507 static void qu8_gemm_6x16c4__neondot(benchmark::State& state, const char* net) {
Frank Barcharde0331262021-08-11 23:18:59 -0700508 GEMMBenchmark(state,
509 xnn_qu8_gemm_minmax_rndnu_ukernel_6x16c4__neondot,
510 xnn_init_qu8_conv_minmax_rndnu_neon_params,
511 6, 16, 4, 1, benchmark::utils::CheckNEONDOT);
Frank Barchard88e839c2021-08-11 00:12:31 -0700512 }
513 static void qu8_gemm_8x16c4__neondot(benchmark::State& state, const char* net) {
Frank Barcharde0331262021-08-11 23:18:59 -0700514 GEMMBenchmark(state,
515 xnn_qu8_gemm_minmax_rndnu_ukernel_8x16c4__neondot,
516 xnn_init_qu8_conv_minmax_rndnu_neon_params,
517 8, 16, 4, 1, benchmark::utils::CheckNEONDOT);
518 }
519 static void qu8_gemm_1x8__neon_mlal_lane(benchmark::State& state, const char* net) {
520 GEMMBenchmark(state,
521 xnn_qu8_gemm_minmax_rndnu_ukernel_1x8__neon_mlal_lane,
522 xnn_init_qu8_conv_minmax_rndnu_neon_params,
523 1, 8, 1, 1, benchmark::utils::CheckNEON);
Frank Barchard88e839c2021-08-11 00:12:31 -0700524 }
Frank Barchardd5a53332022-01-10 03:44:40 -0800525 static void qu8_gemm_2x8__neon_mlal_lane(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -0700526 GEMMBenchmark(state,
Frank Barchardd5a53332022-01-10 03:44:40 -0800527 xnn_qu8_gemm_minmax_rndnu_ukernel_2x8__neon_mlal_lane,
Marat Dukhanafd2ed92021-07-27 00:21:05 -0700528 xnn_init_qu8_conv_minmax_rndnu_neon_params,
Frank Barchardd5a53332022-01-10 03:44:40 -0800529 2, 8, 1, 1, benchmark::utils::CheckNEON);
530 }
531 static void qu8_gemm_3x8__neon_mlal_lane(benchmark::State& state, const char* net) {
532 GEMMBenchmark(state,
533 xnn_qu8_gemm_minmax_rndnu_ukernel_3x8__neon_mlal_lane,
534 xnn_init_qu8_conv_minmax_rndnu_neon_params,
535 3, 8, 1, 1, benchmark::utils::CheckNEON);
Frank Barcharde0331262021-08-11 23:18:59 -0700536 }
537 static void qu8_gemm_4x8__neon_mlal_lane(benchmark::State& state, const char* net) {
538 GEMMBenchmark(state,
539 xnn_qu8_gemm_minmax_rndnu_ukernel_4x8__neon_mlal_lane,
540 xnn_init_qu8_conv_minmax_rndnu_neon_params,
541 4, 8, 1, 1, benchmark::utils::CheckNEON);
Marat Dukhand8e2d712021-07-26 23:35:50 -0700542 }
Frank Barchardd5a53332022-01-10 03:44:40 -0800543 static void qu8_gemm_6x8__neon_mlal_lane(benchmark::State& state, const char* net) {
544 GEMMBenchmark(state,
545 xnn_qu8_gemm_minmax_rndnu_ukernel_6x8__neon_mlal_lane,
546 xnn_init_qu8_conv_minmax_rndnu_neon_params,
547 6, 8, 1, 1, benchmark::utils::CheckNEON);
548 }
549 static void qu8_gemm_1x16__neon_mlal_lane(benchmark::State& state, const char* net) {
550 GEMMBenchmark(state,
551 xnn_qu8_gemm_minmax_rndnu_ukernel_1x16__neon_mlal_lane,
552 xnn_init_qu8_conv_minmax_rndnu_neon_params,
553 1, 16, 1, 1, benchmark::utils::CheckNEON);
554 }
555 static void qu8_gemm_2x16__neon_mlal_lane(benchmark::State& state, const char* net) {
556 GEMMBenchmark(state,
557 xnn_qu8_gemm_minmax_rndnu_ukernel_2x16__neon_mlal_lane,
558 xnn_init_qu8_conv_minmax_rndnu_neon_params,
559 2, 16, 1, 1, benchmark::utils::CheckNEON);
560 }
561 static void qu8_gemm_3x16__neon_mlal_lane(benchmark::State& state, const char* net) {
562 GEMMBenchmark(state,
563 xnn_qu8_gemm_minmax_rndnu_ukernel_3x16__neon_mlal_lane,
564 xnn_init_qu8_conv_minmax_rndnu_neon_params,
565 3, 16, 1, 1, benchmark::utils::CheckNEON);
566 }
Marat Dukhand8e2d712021-07-26 23:35:50 -0700567 static void qu8_gemm_4x16__neon_mlal_lane(benchmark::State& state, const char* net) {
568 GEMMBenchmark(state,
Marat Dukhanafd2ed92021-07-27 00:21:05 -0700569 xnn_qu8_gemm_minmax_rndnu_ukernel_4x16__neon_mlal_lane,
570 xnn_init_qu8_conv_minmax_rndnu_neon_params,
Frank Barcharde0331262021-08-11 23:18:59 -0700571 4, 16, 1, 1, benchmark::utils::CheckNEON);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700572 }
Frank Barchardd5a53332022-01-10 03:44:40 -0800573 static void qu8_gemm_6x16__neon_mlal_lane(benchmark::State& state, const char* net) {
574 GEMMBenchmark(state,
575 xnn_qu8_gemm_minmax_rndnu_ukernel_6x16__neon_mlal_lane,
576 xnn_init_qu8_conv_minmax_rndnu_neon_params,
577 6, 16, 1, 1, benchmark::utils::CheckNEON);
578 }
Frank Barchardcdf59a52021-09-08 13:55:24 -0700579 static void qu8_gemm_1x32c4__neondot(benchmark::State& state, const char* net) {
580 GEMMBenchmark(state,
581 xnn_qu8_gemm_minmax_rndnu_ukernel_1x32c4__neondot,
582 xnn_init_qu8_conv_minmax_rndnu_neon_params,
583 1, 32, 4, 1, benchmark::utils::CheckNEONDOT);
584 }
585 static void qu8_gemm_2x32c4__neondot(benchmark::State& state, const char* net) {
586 GEMMBenchmark(state,
587 xnn_qu8_gemm_minmax_rndnu_ukernel_2x32c4__neondot,
588 xnn_init_qu8_conv_minmax_rndnu_neon_params,
589 2, 32, 4, 1, benchmark::utils::CheckNEONDOT);
590 }
591 static void qu8_gemm_3x32c4__neondot(benchmark::State& state, const char* net) {
592 GEMMBenchmark(state,
593 xnn_qu8_gemm_minmax_rndnu_ukernel_3x32c4__neondot,
594 xnn_init_qu8_conv_minmax_rndnu_neon_params,
595 3, 32, 4, 1, benchmark::utils::CheckNEONDOT);
596 }
// Benchmark registration for the ARM/ARM64 QU8 GEMM microkernels defined above.
// NEONDOT dot-product ukernels (c4 packing).
BENCHMARK_GEMM(qu8_gemm_1x8c4__neondot)
BENCHMARK_GEMM(qu8_gemm_2x8c4__neondot)
BENCHMARK_GEMM(qu8_gemm_3x8c4__neondot)
BENCHMARK_GEMM(qu8_gemm_4x8c4__neondot)
BENCHMARK_GEMM(qu8_gemm_5x8c4__neondot)
BENCHMARK_GEMM(qu8_gemm_6x8c4__neondot)
BENCHMARK_GEMM(qu8_gemm_8x8c4__neondot)
BENCHMARK_GEMM(qu8_gemm_1x16c4__neondot)
BENCHMARK_GEMM(qu8_gemm_2x16c4__neondot)
BENCHMARK_GEMM(qu8_gemm_3x16c4__neondot)
BENCHMARK_GEMM(qu8_gemm_4x16c4__neondot)
BENCHMARK_GEMM(qu8_gemm_5x16c4__neondot)
BENCHMARK_GEMM(qu8_gemm_6x16c4__neondot)
BENCHMARK_GEMM(qu8_gemm_8x16c4__neondot)
BENCHMARK_GEMM(qu8_gemm_1x32c4__neondot)
BENCHMARK_GEMM(qu8_gemm_2x32c4__neondot)
BENCHMARK_GEMM(qu8_gemm_3x32c4__neondot)
// NEON MLAL-lane ukernels (kr=1).
BENCHMARK_GEMM(qu8_gemm_1x8__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_2x8__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_3x8__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_4x8__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_6x8__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_1x16__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_2x16__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_3x16__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_4x16__neon_mlal_lane)
BENCHMARK_GEMM(qu8_gemm_6x16__neon_mlal_lane)
Marat Dukhand8e2d712021-07-26 23:35:50 -0700624#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
625
Frank Barchardca4c68e2021-08-25 19:06:40 -0700626
Marat Dukhand8e2d712021-07-26 23:35:50 -0700627#if XNN_ARCH_X86 || XNN_ARCH_X86_64
628 static void qu8_gemm_1x16c8__avx512skx(benchmark::State& state, const char* net) {
629 GEMMBenchmark(state,
630 xnn_qu8_gemm_minmax_fp32_ukernel_1x16c8__avx512skx,
631 xnn_init_qu8_conv_minmax_fp32_avx512_params,
632 1, 16, 8, 1,
633 benchmark::utils::CheckAVX512SKX);
634 }
635 static void qu8_gemm_2x16c8__avx512skx(benchmark::State& state, const char* net) {
636 GEMMBenchmark(state,
637 xnn_qu8_gemm_minmax_fp32_ukernel_2x16c8__avx512skx,
638 xnn_init_qu8_conv_minmax_fp32_avx512_params,
639 2, 16, 8, 1,
640 benchmark::utils::CheckAVX512SKX);
641 }
642 static void qu8_gemm_3x16c8__avx512skx(benchmark::State& state, const char* net) {
643 GEMMBenchmark(state,
644 xnn_qu8_gemm_minmax_fp32_ukernel_3x16c8__avx512skx,
645 xnn_init_qu8_conv_minmax_fp32_avx512_params,
646 3, 16, 8, 1,
647 benchmark::utils::CheckAVX512SKX);
648 }
649 static void qu8_gemm_4x16c8__avx512skx(benchmark::State& state, const char* net) {
650 GEMMBenchmark(state,
651 xnn_qu8_gemm_minmax_fp32_ukernel_4x16c8__avx512skx,
652 xnn_init_qu8_conv_minmax_fp32_avx512_params,
653 4, 16, 8, 1,
654 benchmark::utils::CheckAVX512SKX);
655 }
656 static void qu8_gemm_1x8c8__avx2(benchmark::State& state, const char* net) {
657 GEMMBenchmark(state,
658 xnn_qu8_gemm_minmax_fp32_ukernel_1x8c8__avx2,
659 xnn_init_qu8_conv_minmax_fp32_avx2_params,
660 1, 8, 8, 1,
661 benchmark::utils::CheckAVX2);
662 }
663 static void qu8_gemm_2x8c8__avx2(benchmark::State& state, const char* net) {
664 GEMMBenchmark(state,
665 xnn_qu8_gemm_minmax_fp32_ukernel_2x8c8__avx2,
666 xnn_init_qu8_conv_minmax_fp32_avx2_params,
667 2, 8, 8, 1,
668 benchmark::utils::CheckAVX2);
669 }
670 static void qu8_gemm_3x8c8__avx2(benchmark::State& state, const char* net) {
671 GEMMBenchmark(state,
672 xnn_qu8_gemm_minmax_fp32_ukernel_3x8c8__avx2,
673 xnn_init_qu8_conv_minmax_fp32_avx2_params,
674 3, 8, 8, 1,
675 benchmark::utils::CheckAVX2);
676 }
677 static void qu8_gemm_1x4c2__xop_ld64(benchmark::State& state, const char* net) {
678 GEMMBenchmark(state,
679 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
680 xnn_init_qu8_conv_minmax_fp32_sse2_params,
681 1, 4, 2, 1,
682 benchmark::utils::CheckXOP);
683 }
684 static void qu8_gemm_2x4c2__xop_ld64(benchmark::State& state, const char* net) {
685 GEMMBenchmark(state,
686 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
687 xnn_init_qu8_conv_minmax_fp32_sse2_params,
688 2, 4, 2, 1,
689 benchmark::utils::CheckXOP);
690 }
691 static void qu8_gemm_3x4c2__xop_ld64(benchmark::State& state, const char* net) {
692 GEMMBenchmark(state,
693 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
694 xnn_init_qu8_conv_minmax_fp32_sse2_params,
695 3, 4, 2, 1,
696 benchmark::utils::CheckXOP);
697 }
698 static void qu8_gemm_4x4c2__xop_ld64(benchmark::State& state, const char* net) {
699 GEMMBenchmark(state,
700 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
701 xnn_init_qu8_conv_minmax_fp32_sse2_params,
702 4, 4, 2, 1,
703 benchmark::utils::CheckXOP);
704 }
705 static void qu8_gemm_1x4c2__xop_ld128(benchmark::State& state, const char* net) {
706 GEMMBenchmark(state,
707 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
708 xnn_init_qu8_conv_minmax_fp32_sse2_params,
709 1, 4, 2, 1,
710 benchmark::utils::CheckXOP);
711 }
712 static void qu8_gemm_2x4c2__xop_ld128(benchmark::State& state, const char* net) {
713 GEMMBenchmark(state,
714 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
715 xnn_init_qu8_conv_minmax_fp32_sse2_params,
716 2, 4, 2, 1,
717 benchmark::utils::CheckXOP);
718 }
719 static void qu8_gemm_3x4c2__xop_ld128(benchmark::State& state, const char* net) {
720 GEMMBenchmark(state,
721 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
722 xnn_init_qu8_conv_minmax_fp32_sse2_params,
723 3, 4, 2, 1,
724 benchmark::utils::CheckXOP);
725 }
726 static void qu8_gemm_4x4c2__xop_ld128(benchmark::State& state, const char* net) {
727 GEMMBenchmark(state,
728 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
729 xnn_init_qu8_conv_minmax_fp32_sse2_params,
730 4, 4, 2, 1,
731 benchmark::utils::CheckXOP);
732 }
733 static void qu8_gemm_1x4c8__xop_ld64(benchmark::State& state, const char* net) {
734 GEMMBenchmark(state,
735 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld64,
736 xnn_init_qu8_conv_minmax_fp32_sse2_params,
737 1, 4, 8, 1,
738 benchmark::utils::CheckXOP);
739 }
740 static void qu8_gemm_2x4c8__xop_ld64(benchmark::State& state, const char* net) {
741 GEMMBenchmark(state,
742 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld64,
743 xnn_init_qu8_conv_minmax_fp32_sse2_params,
744 2, 4, 8, 1,
745 benchmark::utils::CheckXOP);
746 }
747 static void qu8_gemm_3x4c8__xop_ld64(benchmark::State& state, const char* net) {
748 GEMMBenchmark(state,
749 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld64,
750 xnn_init_qu8_conv_minmax_fp32_sse2_params,
751 3, 4, 8, 1,
752 benchmark::utils::CheckXOP);
753 }
754 static void qu8_gemm_1x4c8__xop_ld128(benchmark::State& state, const char* net) {
755 GEMMBenchmark(state,
756 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__xop_ld128,
757 xnn_init_qu8_conv_minmax_fp32_sse2_params,
758 1, 4, 8, 1,
759 benchmark::utils::CheckXOP);
760 }
761 static void qu8_gemm_2x4c8__xop_ld128(benchmark::State& state, const char* net) {
762 GEMMBenchmark(state,
763 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__xop_ld128,
764 xnn_init_qu8_conv_minmax_fp32_sse2_params,
765 2, 4, 8, 1,
766 benchmark::utils::CheckXOP);
767 }
768 static void qu8_gemm_3x4c8__xop_ld128(benchmark::State& state, const char* net) {
769 GEMMBenchmark(state,
770 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__xop_ld128,
771 xnn_init_qu8_conv_minmax_fp32_sse2_params,
772 3, 4, 8, 1,
773 benchmark::utils::CheckXOP);
774 }
775 static void qu8_gemm_1x4c2__avx_ld64(benchmark::State& state, const char* net) {
776 GEMMBenchmark(state,
777 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
778 xnn_init_qu8_conv_minmax_fp32_sse2_params,
779 1, 4, 2, 1,
780 benchmark::utils::CheckAVX);
781 }
782 static void qu8_gemm_2x4c2__avx_ld64(benchmark::State& state, const char* net) {
783 GEMMBenchmark(state,
784 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
785 xnn_init_qu8_conv_minmax_fp32_sse2_params,
786 2, 4, 2, 1,
787 benchmark::utils::CheckAVX);
788 }
789 static void qu8_gemm_3x4c2__avx_ld64(benchmark::State& state, const char* net) {
790 GEMMBenchmark(state,
791 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
792 xnn_init_qu8_conv_minmax_fp32_sse2_params,
793 3, 4, 2, 1,
794 benchmark::utils::CheckAVX);
795 }
796 static void qu8_gemm_4x4c2__avx_ld64(benchmark::State& state, const char* net) {
797 GEMMBenchmark(state,
798 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
799 xnn_init_qu8_conv_minmax_fp32_sse2_params,
800 4, 4, 2, 1,
801 benchmark::utils::CheckAVX);
802 }
803 static void qu8_gemm_1x4c2__avx_ld128(benchmark::State& state, const char* net) {
804 GEMMBenchmark(state,
805 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
806 xnn_init_qu8_conv_minmax_fp32_sse2_params,
807 1, 4, 2, 1,
808 benchmark::utils::CheckAVX);
809 }
810 static void qu8_gemm_2x4c2__avx_ld128(benchmark::State& state, const char* net) {
811 GEMMBenchmark(state,
812 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
813 xnn_init_qu8_conv_minmax_fp32_sse2_params,
814 2, 4, 2, 1,
815 benchmark::utils::CheckAVX);
816 }
817 static void qu8_gemm_3x4c2__avx_ld128(benchmark::State& state, const char* net) {
818 GEMMBenchmark(state,
819 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
820 xnn_init_qu8_conv_minmax_fp32_sse2_params,
821 3, 4, 2, 1,
822 benchmark::utils::CheckAVX);
823 }
824 static void qu8_gemm_4x4c2__avx_ld128(benchmark::State& state, const char* net) {
825 GEMMBenchmark(state,
826 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
827 xnn_init_qu8_conv_minmax_fp32_sse2_params,
828 4, 4, 2, 1,
829 benchmark::utils::CheckAVX);
830 }
831 static void qu8_gemm_1x4c8__avx_ld64(benchmark::State& state, const char* net) {
832 GEMMBenchmark(state,
833 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld64,
834 xnn_init_qu8_conv_minmax_fp32_sse2_params,
835 1, 4, 8, 1,
836 benchmark::utils::CheckAVX);
837 }
838 static void qu8_gemm_2x4c8__avx_ld64(benchmark::State& state, const char* net) {
839 GEMMBenchmark(state,
840 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld64,
841 xnn_init_qu8_conv_minmax_fp32_sse2_params,
842 2, 4, 8, 1,
843 benchmark::utils::CheckAVX);
844 }
845 static void qu8_gemm_3x4c8__avx_ld64(benchmark::State& state, const char* net) {
846 GEMMBenchmark(state,
847 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld64,
848 xnn_init_qu8_conv_minmax_fp32_sse2_params,
849 3, 4, 8, 1,
850 benchmark::utils::CheckAVX);
851 }
852 static void qu8_gemm_1x4c8__avx_ld128(benchmark::State& state, const char* net) {
853 GEMMBenchmark(state,
854 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128,
855 xnn_init_qu8_conv_minmax_fp32_sse2_params,
856 1, 4, 8, 1,
857 benchmark::utils::CheckAVX);
858 }
859 static void qu8_gemm_2x4c8__avx_ld128(benchmark::State& state, const char* net) {
860 GEMMBenchmark(state,
861 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128,
862 xnn_init_qu8_conv_minmax_fp32_sse2_params,
863 2, 4, 8, 1,
864 benchmark::utils::CheckAVX);
865 }
866 static void qu8_gemm_3x4c8__avx_ld128(benchmark::State& state, const char* net) {
867 GEMMBenchmark(state,
868 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__avx_ld128,
869 xnn_init_qu8_conv_minmax_fp32_sse2_params,
870 3, 4, 8, 1,
871 benchmark::utils::CheckAVX);
872 }
873 static void qu8_gemm_1x4c2__sse41_ld64(benchmark::State& state, const char* net) {
874 GEMMBenchmark(state,
875 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
876 xnn_init_qu8_conv_minmax_fp32_sse2_params,
877 1, 4, 2, 1,
878 benchmark::utils::CheckSSE41);
879 }
880 static void qu8_gemm_2x4c2__sse41_ld64(benchmark::State& state, const char* net) {
881 GEMMBenchmark(state,
882 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
883 xnn_init_qu8_conv_minmax_fp32_sse2_params,
884 2, 4, 2, 1,
885 benchmark::utils::CheckSSE41);
886 }
887 static void qu8_gemm_3x4c2__sse41_ld64(benchmark::State& state, const char* net) {
888 GEMMBenchmark(state,
889 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
890 xnn_init_qu8_conv_minmax_fp32_sse2_params,
891 3, 4, 2, 1,
892 benchmark::utils::CheckSSE41);
893 }
894 static void qu8_gemm_4x4c2__sse41_ld64(benchmark::State& state, const char* net) {
895 GEMMBenchmark(state,
896 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
897 xnn_init_qu8_conv_minmax_fp32_sse2_params,
898 4, 4, 2, 1,
899 benchmark::utils::CheckSSE41);
900 }
901 static void qu8_gemm_1x4c2__sse41_ld128(benchmark::State& state, const char* net) {
902 GEMMBenchmark(state,
903 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
904 xnn_init_qu8_conv_minmax_fp32_sse2_params,
905 1, 4, 2, 1,
906 benchmark::utils::CheckSSE41);
907 }
908 static void qu8_gemm_2x4c2__sse41_ld128(benchmark::State& state, const char* net) {
909 GEMMBenchmark(state,
910 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
911 xnn_init_qu8_conv_minmax_fp32_sse2_params,
912 2, 4, 2, 1,
913 benchmark::utils::CheckSSE41);
914 }
915 static void qu8_gemm_3x4c2__sse41_ld128(benchmark::State& state, const char* net) {
916 GEMMBenchmark(state,
917 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
918 xnn_init_qu8_conv_minmax_fp32_sse2_params,
919 3, 4, 2, 1,
920 benchmark::utils::CheckSSE41);
921 }
922 static void qu8_gemm_4x4c2__sse41_ld128(benchmark::State& state, const char* net) {
923 GEMMBenchmark(state,
924 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
925 xnn_init_qu8_conv_minmax_fp32_sse2_params,
926 4, 4, 2, 1,
927 benchmark::utils::CheckSSE41);
928 }
929 static void qu8_gemm_1x4c8__sse41_ld64(benchmark::State& state, const char* net) {
930 GEMMBenchmark(state,
931 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld64,
932 xnn_init_qu8_conv_minmax_fp32_sse2_params,
933 1, 4, 8, 1,
934 benchmark::utils::CheckSSE41);
935 }
936 static void qu8_gemm_2x4c8__sse41_ld64(benchmark::State& state, const char* net) {
937 GEMMBenchmark(state,
938 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld64,
939 xnn_init_qu8_conv_minmax_fp32_sse2_params,
940 2, 4, 8, 1,
941 benchmark::utils::CheckSSE41);
942 }
943 static void qu8_gemm_3x4c8__sse41_ld64(benchmark::State& state, const char* net) {
944 GEMMBenchmark(state,
945 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld64,
946 xnn_init_qu8_conv_minmax_fp32_sse2_params,
947 3, 4, 8, 1,
948 benchmark::utils::CheckSSE41);
949 }
950 static void qu8_gemm_1x4c8__sse41_ld128(benchmark::State& state, const char* net) {
951 GEMMBenchmark(state,
952 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse41_ld128,
953 xnn_init_qu8_conv_minmax_fp32_sse2_params,
954 1, 4, 8, 1,
955 benchmark::utils::CheckSSE41);
956 }
957 static void qu8_gemm_2x4c8__sse41_ld128(benchmark::State& state, const char* net) {
958 GEMMBenchmark(state,
959 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse41_ld128,
960 xnn_init_qu8_conv_minmax_fp32_sse2_params,
961 2, 4, 8, 1,
962 benchmark::utils::CheckSSE41);
963 }
964 static void qu8_gemm_3x4c8__sse41_ld128(benchmark::State& state, const char* net) {
965 GEMMBenchmark(state,
966 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse41_ld128,
967 xnn_init_qu8_conv_minmax_fp32_sse2_params,
968 3, 4, 8, 1,
969 benchmark::utils::CheckSSE41);
970 }
971 static void qu8_gemm_1x4c2__sse2_ld64(benchmark::State& state, const char* net) {
972 GEMMBenchmark(state,
973 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
974 xnn_init_qu8_conv_minmax_fp32_sse2_params,
975 1, 4, 2, 1);
976 }
977 static void qu8_gemm_2x4c2__sse2_ld64(benchmark::State& state, const char* net) {
978 GEMMBenchmark(state,
979 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
980 xnn_init_qu8_conv_minmax_fp32_sse2_params,
981 2, 4, 2, 1);
982 }
983 static void qu8_gemm_3x4c2__sse2_ld64(benchmark::State& state, const char* net) {
984 GEMMBenchmark(state,
985 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
986 xnn_init_qu8_conv_minmax_fp32_sse2_params,
987 3, 4, 2, 1);
988 }
989 static void qu8_gemm_4x4c2__sse2_ld64(benchmark::State& state, const char* net) {
990 GEMMBenchmark(state,
991 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
992 xnn_init_qu8_conv_minmax_fp32_sse2_params,
993 4, 4, 2, 1);
994 }
995 static void qu8_gemm_1x4c2__sse2_ld128(benchmark::State& state, const char* net) {
996 GEMMBenchmark(state,
997 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
998 xnn_init_qu8_conv_minmax_fp32_sse2_params,
999 1, 4, 2, 1);
1000 }
1001 static void qu8_gemm_2x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1002 GEMMBenchmark(state,
1003 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
1004 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1005 2, 4, 2, 1);
1006 }
1007 static void qu8_gemm_3x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1008 GEMMBenchmark(state,
1009 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
1010 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1011 3, 4, 2, 1);
1012 }
1013 static void qu8_gemm_4x4c2__sse2_ld128(benchmark::State& state, const char* net) {
1014 GEMMBenchmark(state,
1015 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
1016 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1017 4, 4, 2, 1);
1018 }
1019 static void qu8_gemm_1x4c8__sse2_ld64(benchmark::State& state, const char* net) {
1020 GEMMBenchmark(state,
1021 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld64,
1022 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1023 1, 4, 8, 1);
1024 }
1025 static void qu8_gemm_2x4c8__sse2_ld64(benchmark::State& state, const char* net) {
1026 GEMMBenchmark(state,
1027 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld64,
1028 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1029 2, 4, 8, 1);
1030 }
1031 static void qu8_gemm_3x4c8__sse2_ld64(benchmark::State& state, const char* net) {
1032 GEMMBenchmark(state,
1033 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld64,
1034 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1035 3, 4, 8, 1);
1036 }
1037 static void qu8_gemm_1x4c8__sse2_ld128(benchmark::State& state, const char* net) {
1038 GEMMBenchmark(state,
1039 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__sse2_ld128,
1040 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1041 1, 4, 8, 1);
1042 }
1043 static void qu8_gemm_2x4c8__sse2_ld128(benchmark::State& state, const char* net) {
1044 GEMMBenchmark(state,
1045 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__sse2_ld128,
1046 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1047 2, 4, 8, 1);
1048 }
1049 static void qu8_gemm_3x4c8__sse2_ld128(benchmark::State& state, const char* net) {
1050 GEMMBenchmark(state,
1051 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__sse2_ld128,
1052 xnn_init_qu8_conv_minmax_fp32_sse2_params,
1053 3, 4, 8, 1);
1054 }
1055
// Benchmark registration for the x86/x86-64 QU8 GEMM microkernels defined above.
BENCHMARK_GEMM(qu8_gemm_1x16c8__avx512skx)
BENCHMARK_GEMM(qu8_gemm_2x16c8__avx512skx)
BENCHMARK_GEMM(qu8_gemm_3x16c8__avx512skx)
BENCHMARK_GEMM(qu8_gemm_4x16c8__avx512skx)

BENCHMARK_GEMM(qu8_gemm_1x8c8__avx2)
BENCHMARK_GEMM(qu8_gemm_2x8c8__avx2)
BENCHMARK_GEMM(qu8_gemm_3x8c8__avx2)

BENCHMARK_GEMM(qu8_gemm_1x4c2__xop_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c2__xop_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c2__xop_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c2__xop_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c2__xop_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c2__xop_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c2__xop_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c2__xop_ld128)
BENCHMARK_GEMM(qu8_gemm_1x4c8__xop_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c8__xop_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c8__xop_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c8__xop_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c8__xop_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c8__xop_ld128)

BENCHMARK_GEMM(qu8_gemm_1x4c2__avx_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c2__avx_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c2__avx_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c2__avx_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c2__avx_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c2__avx_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c2__avx_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c2__avx_ld128)
BENCHMARK_GEMM(qu8_gemm_1x4c8__avx_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c8__avx_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c8__avx_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c8__avx_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c8__avx_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c8__avx_ld128)

BENCHMARK_GEMM(qu8_gemm_1x4c2__sse41_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c2__sse41_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c2__sse41_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c2__sse41_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c2__sse41_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c2__sse41_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c2__sse41_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c2__sse41_ld128)
BENCHMARK_GEMM(qu8_gemm_1x4c8__sse41_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c8__sse41_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c8__sse41_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c8__sse41_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c8__sse41_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c8__sse41_ld128)

BENCHMARK_GEMM(qu8_gemm_1x4c2__sse2_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c2__sse2_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c2__sse2_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c2__sse2_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c2__sse2_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c2__sse2_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c2__sse2_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c2__sse2_ld128)
BENCHMARK_GEMM(qu8_gemm_1x4c8__sse2_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c8__sse2_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c8__sse2_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c8__sse2_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c8__sse2_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c8__sse2_ld128)
Marat Dukhan725f47e2021-05-22 10:06:19 -07001124#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
1125
1126
Marat Dukhan4c617792021-12-21 15:47:58 -08001127#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhan348c3772022-02-01 00:36:50 -08001128 static void qu8_gemm_1x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1129 GEMMBenchmark(state,
1130 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld64,
1131 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1132 1, 4, 2, 1);
1133 }
1134 static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1135 GEMMBenchmark(state,
1136 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld64,
1137 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1138 2, 4, 2, 1);
1139 }
1140 static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1141 GEMMBenchmark(state,
1142 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld64,
1143 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1144 3, 4, 2, 1);
1145 }
1146 static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1147 GEMMBenchmark(state,
1148 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld64,
1149 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1150 4, 4, 2, 1);
1151 }
1152
1153 static void qu8_gemm_1x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1154 GEMMBenchmark(state,
1155 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2__wasmsimd_dot16x2_ld128,
1156 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1157 1, 4, 2, 1);
1158 }
1159 static void qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1160 GEMMBenchmark(state,
1161 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2__wasmsimd_dot16x2_ld128,
1162 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1163 2, 4, 2, 1);
1164 }
1165 static void qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1166 GEMMBenchmark(state,
1167 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2__wasmsimd_dot16x2_ld128,
1168 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1169 3, 4, 2, 1);
1170 }
1171 static void qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1172 GEMMBenchmark(state,
1173 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2__wasmsimd_dot16x2_ld128,
1174 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1175 4, 4, 2, 1);
1176 }
1177
1178 static void qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1179 GEMMBenchmark(state,
1180 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld64,
1181 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1182 1, 4, 2, 4);
1183 }
1184 static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1185 GEMMBenchmark(state,
1186 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld64,
1187 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1188 2, 4, 2, 4);
1189 }
1190 static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1191 GEMMBenchmark(state,
1192 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld64,
1193 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1194 3, 4, 2, 4);
1195 }
1196 static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1197 GEMMBenchmark(state,
1198 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld64,
1199 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1200 4, 4, 2, 4);
1201 }
1202
1203 static void qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1204 GEMMBenchmark(state,
1205 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c2s4__wasmsimd_dot16x2_ld128,
1206 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1207 1, 4, 2, 4);
1208 }
1209 static void qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1210 GEMMBenchmark(state,
1211 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c2s4__wasmsimd_dot16x2_ld128,
1212 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1213 2, 4, 2, 4);
1214 }
1215 static void qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1216 GEMMBenchmark(state,
1217 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c2s4__wasmsimd_dot16x2_ld128,
1218 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1219 3, 4, 2, 4);
1220 }
1221 static void qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1222 GEMMBenchmark(state,
1223 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c2s4__wasmsimd_dot16x2_ld128,
1224 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1225 4, 4, 2, 4);
1226 }
1227
1228 static void qu8_gemm_1x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1229 GEMMBenchmark(state,
1230 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld64,
1231 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1232 1, 4, 8, 1);
1233 }
1234 static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1235 GEMMBenchmark(state,
1236 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld64,
1237 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1238 2, 4, 8, 1);
1239 }
1240 static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1241 GEMMBenchmark(state,
1242 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld64,
1243 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1244 3, 4, 8, 1);
1245 }
1246 static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64(benchmark::State& state, const char* net) {
1247 GEMMBenchmark(state,
1248 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld64,
1249 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1250 4, 4, 8, 1);
1251 }
1252
1253 static void qu8_gemm_1x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1254 GEMMBenchmark(state,
1255 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_dot16x2_ld128,
1256 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1257 1, 4, 8, 1);
1258 }
1259 static void qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1260 GEMMBenchmark(state,
1261 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_dot16x2_ld128,
1262 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1263 2, 4, 8, 1);
1264 }
1265 static void qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1266 GEMMBenchmark(state,
1267 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_dot16x2_ld128,
1268 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1269 3, 4, 8, 1);
1270 }
1271 static void qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128(benchmark::State& state, const char* net) {
1272 GEMMBenchmark(state,
1273 xnn_qu8_gemm_minmax_fp32_ukernel_4x4c8__wasmsimd_dot16x2_ld128,
1274 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1275 4, 4, 8, 1);
1276 }
1277
Marat Dukhandfc2db02021-08-08 21:19:07 -07001278 static void qu8_gemm_1x4c8__wasmsimd_mul32_ld64(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -07001279 GEMMBenchmark(state,
Marat Dukhandfc2db02021-08-08 21:19:07 -07001280 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld64,
Marat Dukhand8e2d712021-07-26 23:35:50 -07001281 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1282 1, 4, 8, 1);
1283 }
Marat Dukhandfc2db02021-08-08 21:19:07 -07001284 static void qu8_gemm_2x4c8__wasmsimd_mul32_ld64(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -07001285 GEMMBenchmark(state,
Marat Dukhandfc2db02021-08-08 21:19:07 -07001286 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld64,
Marat Dukhand8e2d712021-07-26 23:35:50 -07001287 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1288 2, 4, 8, 1);
1289 }
Marat Dukhandfc2db02021-08-08 21:19:07 -07001290 static void qu8_gemm_3x4c8__wasmsimd_mul32_ld64(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -07001291 GEMMBenchmark(state,
Marat Dukhandfc2db02021-08-08 21:19:07 -07001292 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld64,
Marat Dukhand8e2d712021-07-26 23:35:50 -07001293 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1294 3, 4, 8, 1);
1295 }
Marat Dukhandfc2db02021-08-08 21:19:07 -07001296 static void qu8_gemm_1x4c8__wasmsimd_mul32_ld128(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -07001297 GEMMBenchmark(state,
Marat Dukhandfc2db02021-08-08 21:19:07 -07001298 xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__wasmsimd_mul32_ld128,
Marat Dukhand8e2d712021-07-26 23:35:50 -07001299 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1300 1, 4, 8, 1);
1301 }
Marat Dukhandfc2db02021-08-08 21:19:07 -07001302 static void qu8_gemm_2x4c8__wasmsimd_mul32_ld128(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -07001303 GEMMBenchmark(state,
Marat Dukhandfc2db02021-08-08 21:19:07 -07001304 xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__wasmsimd_mul32_ld128,
Marat Dukhand8e2d712021-07-26 23:35:50 -07001305 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1306 2, 4, 8, 1);
1307 }
Marat Dukhandfc2db02021-08-08 21:19:07 -07001308 static void qu8_gemm_3x4c8__wasmsimd_mul32_ld128(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -07001309 GEMMBenchmark(state,
Marat Dukhandfc2db02021-08-08 21:19:07 -07001310 xnn_qu8_gemm_minmax_fp32_ukernel_3x4c8__wasmsimd_mul32_ld128,
Marat Dukhand8e2d712021-07-26 23:35:50 -07001311 xnn_init_qu8_conv_minmax_fp32_wasmsimd_params,
1312 3, 4, 8, 1);
1313 }
1314
// Register the wasmsimd QU8 GEMM benchmarks defined above with the
// google-benchmark runner (BENCHMARK_GEMM is declared in bench/gemm.h).
// dot16x2 variants, 4c2 channel blocking:
BENCHMARK_GEMM(qu8_gemm_1x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c2__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c2__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c2__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c2__wasmsimd_dot16x2_ld128)

// dot16x2 variants, 4c2s4 channel blocking:
BENCHMARK_GEMM(qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c2s4__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c2s4__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c2s4__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c2s4__wasmsimd_dot16x2_ld128)

// dot16x2 variants, 4c8 channel blocking:
BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_dot16x2_ld128)
BENCHMARK_GEMM(qu8_gemm_4x4c8__wasmsimd_dot16x2_ld128)

// mul32 variants, 4c8 channel blocking:
BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_mul32_ld64)
BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_mul32_ld64)
BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_mul32_ld64)
BENCHMARK_GEMM(qu8_gemm_1x4c8__wasmsimd_mul32_ld128)
BENCHMARK_GEMM(qu8_gemm_2x4c8__wasmsimd_mul32_ld128)
BENCHMARK_GEMM(qu8_gemm_3x4c8__wasmsimd_mul32_ld128)
Marat Dukhan4c617792021-12-21 15:47:58 -08001348#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
Marat Dukhand8e2d712021-07-26 23:35:50 -07001349
1350
Marat Dukhan7c1115f2022-01-04 17:18:41 -08001351#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1352 static void qu8_gemm_1x2__wasm_fmagic(benchmark::State& state, const char* net) {
1353 GEMMBenchmark(state,
1354 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__wasm_fmagic,
1355 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1356 1, 2, 1, 1);
1357 }
1358 static void qu8_gemm_2x2__wasm_fmagic(benchmark::State& state, const char* net) {
1359 GEMMBenchmark(state,
1360 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__wasm_fmagic,
1361 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1362 2, 2, 1, 1);
1363 }
1364 static void qu8_gemm_3x2__wasm_fmagic(benchmark::State& state, const char* net) {
1365 GEMMBenchmark(state,
1366 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__wasm_fmagic,
1367 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1368 3, 2, 1, 1);
1369 }
1370 static void qu8_gemm_4x2__wasm_fmagic(benchmark::State& state, const char* net) {
1371 GEMMBenchmark(state,
1372 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__wasm_fmagic,
1373 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1374 4, 2, 1, 1);
1375 }
1376 static void qu8_gemm_1x4__wasm_fmagic(benchmark::State& state, const char* net) {
1377 GEMMBenchmark(state,
1378 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__wasm_fmagic,
1379 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1380 1, 4, 1, 1);
1381 }
1382 static void qu8_gemm_2x4__wasm_fmagic(benchmark::State& state, const char* net) {
1383 GEMMBenchmark(state,
1384 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__wasm_fmagic,
1385 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1386 2, 4, 1, 1);
1387 }
1388 static void qu8_gemm_3x4__wasm_fmagic(benchmark::State& state, const char* net) {
1389 GEMMBenchmark(state,
1390 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__wasm_fmagic,
1391 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1392 3, 4, 1, 1);
1393 }
1394 static void qu8_gemm_4x4__wasm_fmagic(benchmark::State& state, const char* net) {
1395 GEMMBenchmark(state,
1396 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__wasm_fmagic,
1397 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
1398 4, 4, 1, 1);
1399 }
1400
// Register the wasm fmagic QU8 GEMM benchmarks defined above with the
// google-benchmark runner.
BENCHMARK_GEMM(qu8_gemm_1x2__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x2__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x2__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x2__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_1x4__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x4__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x4__wasm_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x4__wasm_fmagic)
1409#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
1410
1411
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001412static void qu8_gemm_1x2__scalar_fmagic(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -07001413 GEMMBenchmark(state,
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001414 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_fmagic,
1415 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
Marat Dukhand8e2d712021-07-26 23:35:50 -07001416 1, 2, 1, 1);
1417}
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001418static void qu8_gemm_2x2__scalar_fmagic(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -07001419 GEMMBenchmark(state,
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001420 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_fmagic,
1421 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
Marat Dukhand8e2d712021-07-26 23:35:50 -07001422 2, 2, 1, 1);
1423}
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001424static void qu8_gemm_3x2__scalar_fmagic(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -07001425 GEMMBenchmark(state,
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001426 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_fmagic,
1427 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
Marat Dukhand8e2d712021-07-26 23:35:50 -07001428 3, 2, 1, 1);
1429}
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001430static void qu8_gemm_4x2__scalar_fmagic(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -07001431 GEMMBenchmark(state,
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001432 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_fmagic,
1433 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
Marat Dukhand8e2d712021-07-26 23:35:50 -07001434 4, 2, 1, 1);
1435}
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001436static void qu8_gemm_1x4__scalar_fmagic(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -07001437 GEMMBenchmark(state,
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001438 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_fmagic,
1439 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
Marat Dukhand8e2d712021-07-26 23:35:50 -07001440 1, 4, 1, 1);
1441}
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001442static void qu8_gemm_2x4__scalar_fmagic(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -07001443 GEMMBenchmark(state,
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001444 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_fmagic,
1445 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
Marat Dukhand8e2d712021-07-26 23:35:50 -07001446 2, 4, 1, 1);
1447}
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001448static void qu8_gemm_3x4__scalar_fmagic(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -07001449 GEMMBenchmark(state,
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001450 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_fmagic,
1451 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
Marat Dukhand8e2d712021-07-26 23:35:50 -07001452 3, 4, 1, 1);
1453}
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001454static void qu8_gemm_4x4__scalar_fmagic(benchmark::State& state, const char* net) {
Marat Dukhand8e2d712021-07-26 23:35:50 -07001455 GEMMBenchmark(state,
Marat Dukhan2ac722e2022-01-04 01:54:20 -08001456 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_fmagic,
1457 xnn_init_qu8_conv_minmax_fp32_scalar_fmagic_params,
Marat Dukhand8e2d712021-07-26 23:35:50 -07001458 4, 4, 1, 1);
1459}
1460
Marat Dukhan440e8ed2022-01-04 15:30:57 -08001461static void qu8_gemm_1x2__scalar_imagic(benchmark::State& state, const char* net) {
1462 GEMMBenchmark(state,
1463 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_imagic,
1464 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1465 1, 2, 1, 1);
1466}
1467static void qu8_gemm_2x2__scalar_imagic(benchmark::State& state, const char* net) {
1468 GEMMBenchmark(state,
1469 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_imagic,
1470 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1471 2, 2, 1, 1);
1472}
1473static void qu8_gemm_3x2__scalar_imagic(benchmark::State& state, const char* net) {
1474 GEMMBenchmark(state,
1475 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_imagic,
1476 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1477 3, 2, 1, 1);
1478}
1479static void qu8_gemm_4x2__scalar_imagic(benchmark::State& state, const char* net) {
1480 GEMMBenchmark(state,
1481 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_imagic,
1482 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1483 4, 2, 1, 1);
1484}
1485static void qu8_gemm_1x4__scalar_imagic(benchmark::State& state, const char* net) {
1486 GEMMBenchmark(state,
1487 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_imagic,
1488 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1489 1, 4, 1, 1);
1490}
1491static void qu8_gemm_2x4__scalar_imagic(benchmark::State& state, const char* net) {
1492 GEMMBenchmark(state,
1493 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_imagic,
1494 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1495 2, 4, 1, 1);
1496}
1497static void qu8_gemm_3x4__scalar_imagic(benchmark::State& state, const char* net) {
1498 GEMMBenchmark(state,
1499 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_imagic,
1500 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1501 3, 4, 1, 1);
1502}
1503static void qu8_gemm_4x4__scalar_imagic(benchmark::State& state, const char* net) {
1504 GEMMBenchmark(state,
1505 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_imagic,
1506 xnn_init_qu8_conv_minmax_fp32_scalar_imagic_params,
1507 4, 4, 1, 1);
1508}
1509
1510static void qu8_gemm_1x2__scalar_lrintf(benchmark::State& state, const char* net) {
1511 GEMMBenchmark(state,
1512 xnn_qu8_gemm_minmax_fp32_ukernel_1x2__scalar_lrintf,
1513 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1514 1, 2, 1, 1);
1515}
1516static void qu8_gemm_2x2__scalar_lrintf(benchmark::State& state, const char* net) {
1517 GEMMBenchmark(state,
1518 xnn_qu8_gemm_minmax_fp32_ukernel_2x2__scalar_lrintf,
1519 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1520 2, 2, 1, 1);
1521}
1522static void qu8_gemm_3x2__scalar_lrintf(benchmark::State& state, const char* net) {
1523 GEMMBenchmark(state,
1524 xnn_qu8_gemm_minmax_fp32_ukernel_3x2__scalar_lrintf,
1525 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1526 3, 2, 1, 1);
1527}
1528static void qu8_gemm_4x2__scalar_lrintf(benchmark::State& state, const char* net) {
1529 GEMMBenchmark(state,
1530 xnn_qu8_gemm_minmax_fp32_ukernel_4x2__scalar_lrintf,
1531 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1532 4, 2, 1, 1);
1533}
1534static void qu8_gemm_1x4__scalar_lrintf(benchmark::State& state, const char* net) {
1535 GEMMBenchmark(state,
1536 xnn_qu8_gemm_minmax_fp32_ukernel_1x4__scalar_lrintf,
1537 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1538 1, 4, 1, 1);
1539}
1540static void qu8_gemm_2x4__scalar_lrintf(benchmark::State& state, const char* net) {
1541 GEMMBenchmark(state,
1542 xnn_qu8_gemm_minmax_fp32_ukernel_2x4__scalar_lrintf,
1543 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1544 2, 4, 1, 1);
1545}
1546static void qu8_gemm_3x4__scalar_lrintf(benchmark::State& state, const char* net) {
1547 GEMMBenchmark(state,
1548 xnn_qu8_gemm_minmax_fp32_ukernel_3x4__scalar_lrintf,
1549 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1550 3, 4, 1, 1);
1551}
1552static void qu8_gemm_4x4__scalar_lrintf(benchmark::State& state, const char* net) {
1553 GEMMBenchmark(state,
1554 xnn_qu8_gemm_minmax_fp32_ukernel_4x4__scalar_lrintf,
1555 xnn_init_qu8_conv_minmax_fp32_scalar_lrintf_params,
1556 4, 4, 1, 1);
1557}
Marat Dukhand8e2d712021-07-26 23:35:50 -07001558
// Register the scalar QU8 GEMM benchmarks defined above with the
// google-benchmark runner.
// fmagic rounding variants:
BENCHMARK_GEMM(qu8_gemm_1x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_fmagic)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_fmagic)

// imagic rounding variants:
BENCHMARK_GEMM(qu8_gemm_1x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_imagic)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_imagic)

// lrintf rounding variants:
BENCHMARK_GEMM(qu8_gemm_1x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_2x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_3x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_4x2__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_1x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_2x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_3x4__scalar_lrintf)
BENCHMARK_GEMM(qu8_gemm_4x4__scalar_lrintf)
1585
Marat Dukhand8e2d712021-07-26 23:35:50 -07001586
// Register the ruy/gemmlowp reference benchmarks (defined earlier in this
// file) only when the corresponding build macro is set.
#ifdef BENCHMARK_RUY
BENCHMARK_GEMM(ruy_st)
#endif  // BENCHMARK_RUY
#ifdef BENCHMARK_GEMMLOWP
BENCHMARK_GEMM(gemmlowp_st)
#endif  // BENCHMARK_GEMMLOWP

// Emit google-benchmark's main() unless the embedding build provides its own.
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif