// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

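// Microbenchmarks for XNNPACK's F16 (half-precision) GEMM minmax microkernels,
// built on the Google Benchmark framework.
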
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <functional>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#include <fp16/fp16.h>
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gemm.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>

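// Benchmarks a single F16 GEMM microkernel on an MxNxK problem taken from
// state.range(). mr and nr are the microkernel's output tile height and width;
// kr and sr are the packing parameters for the K (reduction) dimension.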
static void GEMMBenchmark(benchmark::State& state,
  xnn_f16_gemm_minmax_ukernel_function gemm,
  size_t mr, size_t nr, size_t kr, size_t sr)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (!benchmark::utils::CheckNEONFP16ARITH(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

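  // Round N and K up to the microkernel tile sizes to match the packed-weights layout.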
  const size_t nc_stride = benchmark::utils::RoundUp(nc, nr);
  const size_t kc_stride = benchmark::utils::RoundUp(kc, kr);

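  // Generate random FP16 data for the activations (a), weights (k), and biases (b).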
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  std::vector<uint16_t> a(mc * kc);
  std::generate(a.begin(), a.end(), std::ref(f16rng));
  std::vector<uint16_t> k(nc * kc);
  std::generate(k.begin(), k.end(), std::ref(f16rng));
  std::vector<uint16_t> b(nc);
  std::generate(b.begin(), b.end(), std::ref(f16rng));

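  // Size the packed-weight (w) and output (c) circular buffers so their combined
  // footprint exceeds the last-level cache: each copy of w holds
  // nc_stride * kc_stride weights plus nc_stride biases.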
  const size_t w_elements = nc_stride * kc_stride + nc_stride;
  const size_t c_elements = mc * nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (w_elements + c_elements));

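  // Pack weights and biases into the GOI layout expected by the microkernel.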
  std::vector<uint16_t, AlignedAllocator<uint16_t, 32>> w(w_elements * num_buffers);
  std::fill(w.begin(), w.end(), 0);
  xnn_pack_f16_gemm_goi_w(1 /* groups */, nc, kc, nr, kr, sr, k.data(), b.data(), w.data(), nullptr);
  std::vector<uint16_t> c(c_elements * num_buffers);
  std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* NaN */);

  // Prepare minmax parameters.
  const xnn_f16_scaleminmax_params params = xnn_init_f16_scaleminmax_params(
    UINT16_C(0x3C00) /* 1.0 */, UINT16_C(0x7C00) /* +inf */, UINT16_C(0xFC00) /* -inf */);

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W is not in cache (at any cache level)
    // - C is not in cache (at any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

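    // Tile the MxN output; each microkernel call computes an mb x nb block of C.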
    for (uint32_t m = 0; m < mc; m += mr) {
      const uint32_t mb = std::min(mc - m, mr);
      for (uint32_t n = 0; n < nc; n += nr) {
        const uint32_t nb = std::min(nc - n, nr);
        gemm(
          mb, nb, kc * sizeof(uint16_t),
          a.data() + m * kc, kc * sizeof(uint16_t),
          w.data() + (nc_stride * buffer_index + n) * (kc_stride + 1),
          c.data() + (mc * buffer_index + m) * nc + n, nc * sizeof(uint16_t), nr * sizeof(uint16_t),
          &params);
      }
    }
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

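  // Each of the mc * nc * kc multiply-accumulates counts as 2 FLOPs.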
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}

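// NEON FP16 arithmetic microkernels.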
#if XNN_ARCH_ARM64
  static void f16_gemm_1x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64, 1, 8, 1, 1);
  }

  static void f16_gemm_4x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64, 4, 8, 1, 1);
  }

  static void f16_gemm_6x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64, 6, 8, 1, 1);
  }

  static void f16_gemm_8x8__neonfp16arith_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64, 8, 8, 1, 1);
  }

  static void f16_gemm_1x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64, 1, 16, 1, 1);
  }

  static void f16_gemm_4x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64, 4, 16, 1, 1);
  }

  static void f16_gemm_6x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64, 6, 16, 1, 1);
  }

  static void f16_gemm_8x16__neonfp16arith_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64, 8, 16, 1, 1);
  }

  BENCHMARK_GEMM(f16_gemm_1x8__neonfp16arith_ld64)
  BENCHMARK_GEMM(f16_gemm_4x8__neonfp16arith_ld64)
  BENCHMARK_GEMM(f16_gemm_6x8__neonfp16arith_ld64)
  BENCHMARK_GEMM(f16_gemm_8x8__neonfp16arith_ld64)
  BENCHMARK_GEMM(f16_gemm_1x16__neonfp16arith_ld64)
  BENCHMARK_GEMM(f16_gemm_4x16__neonfp16arith_ld64)
  BENCHMARK_GEMM(f16_gemm_6x16__neonfp16arith_ld64)
  BENCHMARK_GEMM(f16_gemm_8x16__neonfp16arith_ld64)
#endif  // XNN_ARCH_ARM64

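// AArch64 assembly microkernels (only built when XNN_ENABLE_ASSEMBLY is set).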
#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
  static void f16_gemm_1x16__aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32, 1, 16, 1, 1);
  }

  static void f16_gemm_4x16__aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32, 4, 16, 1, 1);
  }

  static void f16_gemm_6x16__aarch64_neonfp16arith_ld32(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32, 6, 16, 1, 1);
  }

  static void f16_gemm_1x8__aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64, 1, 8, 1, 1);
  }

  static void f16_gemm_4x8__aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64, 4, 8, 1, 1);
  }

  static void f16_gemm_6x8__aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64, 6, 8, 1, 1);
  }

  static void f16_gemm_8x8__aarch64_neonfp16arith_ld64(benchmark::State& state, const char* net) {
    GEMMBenchmark(state, xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64, 8, 8, 1, 1);
  }

  BENCHMARK_GEMM(f16_gemm_1x16__aarch64_neonfp16arith_ld32)
  BENCHMARK_GEMM(f16_gemm_4x16__aarch64_neonfp16arith_ld32)
  BENCHMARK_GEMM(f16_gemm_6x16__aarch64_neonfp16arith_ld32)
  BENCHMARK_GEMM(f16_gemm_1x8__aarch64_neonfp16arith_ld64)
  BENCHMARK_GEMM(f16_gemm_4x8__aarch64_neonfp16arith_ld64)
  BENCHMARK_GEMM(f16_gemm_6x8__aarch64_neonfp16arith_ld64)
  BENCHMARK_GEMM(f16_gemm_8x8__aarch64_neonfp16arith_ld64)
#endif  // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
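
// Example invocation (the binary name is hypothetical and depends on the build
// setup); --benchmark_filter is a standard Google Benchmark flag:
//   ./f16-gemm-bench --benchmark_filter=f16_gemm_4x8__neonfp16arith_ld64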