// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstdint>
#include <functional>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#include <fp16/fp16.h>
#include "bench/spmm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
#include <xnnpack/spmm.h>
24
25
26static void SpMMBenchmark(benchmark::State& state,
Marat Dukhanc4302c22022-01-06 19:27:03 -080027 xnn_f16_spmm_minmax_ukernel_function spmm, uint32_t mr, uint32_t nr, float sparsity,
28 xnn_init_f16_scaleminmax_params_fn init_params,
29 benchmark::utils::IsaCheckFunction isa_check = nullptr)
Marat Dukhanbdb56f52020-02-05 21:42:49 -080030{
31 if (!cpuinfo_initialize()) {
32 state.SkipWithError("cpuinfo initialization failed");
33 return;
34 }
Frank Barchard40f50e12020-05-29 22:21:56 -070035 if (!benchmark::utils::CheckNEONFP16ARITH(state)) {
36 return;
37 }
Marat Dukhanbdb56f52020-02-05 21:42:49 -080038
39 const size_t mc = state.range(0);
40 const size_t nc = state.range(1);
41 const size_t kc = state.range(2);
42
43 std::random_device random_device;
44 auto rng = std::mt19937(random_device());
Marat Dukhan44f0ca72020-08-02 21:46:58 -070045 auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
Marat Dukhanbdb56f52020-02-05 21:42:49 -080046 auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
47
48 // if using blocks, generate the reduced matrix first and then extrude along
49 // the block dimension (n), to get the full matrix
50 size_t ncols = nc / nr + nc % nr;
51 std::vector<uint16_t> b(ncols * kc);
52 std::vector<uint16_t> bias(nc);
53 std::vector<uint16_t> w;
54 std::vector<uint32_t> nmap;
55 std::vector<int32_t> dmap;
56 const size_t sparse_end = std::min(size_t(float(b.size()) * sparsity), b.size());
57 const size_t num_nonzeroes = nr * (b.size() - sparse_end);
58
59 const size_t w_elements = num_nonzeroes + nc;
60 const size_t c_elements = mc * nc;
61 const size_t dmap_elements = num_nonzeroes / nr;
62 const size_t nmap_elements = nc;
63 const size_t num_buffers = 1 +
64 benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
65 sizeof(uint16_t) * (w_elements + c_elements) + sizeof(uint32_t) * (dmap_elements + nmap_elements));
66
67 // Micro-kernel can access one element beyond w and dmap for software pipelining.
68 w.reserve(num_buffers * w_elements + 1);
69 dmap.reserve(num_buffers * dmap_elements + 1);
70 nmap.resize(num_buffers * nmap_elements);
71
72 std::vector<size_t> a_offsets(num_buffers);
73
74 for (size_t buffer_index = 0; buffer_index < num_buffers; buffer_index++) {
75 // Re-generate weights. Note: each re-generation produces the number of non-zeroes.
76 std::fill(b.begin(), b.begin() + sparse_end, 0);
77 std::generate(b.begin() + sparse_end, b.end(), std::ref(f16rng));
78 std::shuffle(b.begin(), b.end(), rng);
79 std::generate(bias.begin(), bias.end(), std::ref(f16rng));
80
81 uint32_t first_j = 0, last_j = 0;
82 bool is_first_nonzero = true;
83 for (uint32_t i = 0; i < nc / nr; i++) {
84 for (uint32_t n = 0; n < nr; n++)
85 w.push_back(bias[nr * i + n]);
86 for (uint32_t j = 0; j < kc; j++) {
87 if ((b[i * kc + j] & 0x7FFF) != 0) {
88 for (size_t l = 0; l < nr; l++)
89 w.push_back(fp16_ieee_from_fp32_value(fp16_ieee_to_fp32_value(b[i * kc + j]) + static_cast<float>(i)));
90 if (is_first_nonzero) {
91 first_j = j;
92 } else {
93 const ptrdiff_t increment = int32_t(j - last_j) * int32_t(mc) * int32_t(sizeof(uint16_t));
94 dmap.push_back(increment);
95 }
96 last_j = j;
97 is_first_nonzero = false;
98 nmap[buffer_index * nmap_elements + i] += 1;
99 }
100 }
101 }
102 for (uint32_t i = nc / nr; i < ncols; i++) {
103 w.push_back(bias[i]);
104 for (uint32_t j = 0; j < kc; j++) {
105 if ((b[i * kc + j] & 0x7FFF) != 0) {
106 w.push_back(b[i * kc + j]);
107 if (is_first_nonzero) {
108 first_j = j;
109 } else {
110 const ptrdiff_t increment = int32_t(j - last_j) * int32_t(mc) * int32_t(sizeof(uint16_t));
111 dmap.push_back(increment);
112 }
113 last_j = j;
114 is_first_nonzero = false;
115 nmap[buffer_index * nmap_elements + i] += 1;
116 }
117 }
118 }
119 {
120 const ptrdiff_t increment = int32_t(first_j - last_j) * int32_t(mc) * int32_t(sizeof(uint16_t));
121 dmap.push_back(increment);
122 }
123
124 a_offsets[buffer_index] = first_j * mc;
125 }
126
127 // Micro-kernel can access one element beyond w and dmap for software pipelining.
128 w.resize(w.size() + 1);
129 dmap.resize(dmap.size() + 1);
130
131 std::vector<float, AlignedAllocator<float, 64>> a(kc * mc);
132 std::vector<float, AlignedAllocator<float, 64>> c(num_buffers * c_elements);
133
134 std::generate(a.begin(), a.end(), std::ref(f32rng));
135 std::fill(c.begin(), c.end(), nanf(""));
136
Marat Dukhanc4302c22022-01-06 19:27:03 -0800137 xnn_f16_scaleminmax_params params;
138 init_params(&params, 0x3C00 /* 1.0 */, 0x7C00 /* inf */, 0xFC00 /* -inf */);
Marat Dukhanbdb56f52020-02-05 21:42:49 -0800139
140 size_t buffer_index = 0;
141 for (auto _ : state) {
142 // Use circular buffers (exceeding cache size) and prefetch to control cache state:
143 // - A is always in L1 cache (if fits, otherwise L2, L3, etc)
144 // - W, Kmap, and Nmap is not in cache (for any cache level)
145 // - C is not in cache (for any cache level)
146 state.PauseTiming();
147 benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
148 buffer_index = (buffer_index + 1) % num_buffers;
149 state.ResumeTiming();
150
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800151 spmm(mc * sizeof(uint16_t), nc,
Marat Dukhanbdb56f52020-02-05 21:42:49 -0800152 a.data() + a_offsets[buffer_index],
153 w.data() + buffer_index * w_elements,
154 dmap.data() + buffer_index * dmap_elements,
155 nmap.data() + buffer_index * nmap_elements,
Marat Dukhane8bfcc82020-11-16 12:28:13 -0800156 c.data() + buffer_index * c_elements, mc * sizeof(uint16_t),
Frank Barchard77acbf22020-05-01 10:08:26 -0700157 &params);
Marat Dukhanbdb56f52020-02-05 21:42:49 -0800158 }
159
Marat Dukhand713e8a2020-12-04 14:23:12 -0800160 const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
161 if (cpu_frequency != 0) {
162 state.counters["cpufreq"] = cpu_frequency;
163 }
164
Marat Dukhanbdb56f52020-02-05 21:42:49 -0800165 state.counters["FLOPS"] = benchmark::Counter(
166 uint64_t(state.iterations()) * 2 * mc * num_nonzeroes, benchmark::Counter::kIsRate);
167
168 state.counters["EffFLOPS"] = benchmark::Counter(
169 uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
170}
171
172
173#if XNN_ARCH_ARM64
174 static void spmm80_8x1__neonfp16arith(benchmark::State& state, const char* net) {
Marat Dukhanc4302c22022-01-06 19:27:03 -0800175 SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith, 8, 1, 0.8f,
176 xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
Marat Dukhanbdb56f52020-02-05 21:42:49 -0800177 }
Frank Barchardbeca6522020-10-30 22:34:35 -0700178 static void spmm80_8x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
Marat Dukhanc4302c22022-01-06 19:27:03 -0800179 SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith_x2, 8, 1, 0.8f,
180 xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
Marat Dukhanbdb56f52020-02-05 21:42:49 -0800181 }
182 static void spmm80_16x1__neonfp16arith(benchmark::State& state, const char* net) {
Marat Dukhanc4302c22022-01-06 19:27:03 -0800183 SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith, 16, 1, 0.8f,
184 xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
Marat Dukhanbdb56f52020-02-05 21:42:49 -0800185 }
Frank Barchardbeca6522020-10-30 22:34:35 -0700186 static void spmm80_16x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
Marat Dukhanc4302c22022-01-06 19:27:03 -0800187 SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith_x2, 16, 1, 0.8f,
188 xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
Marat Dukhanbdb56f52020-02-05 21:42:49 -0800189 }
190 static void spmm80_24x1__neonfp16arith(benchmark::State& state, const char* net) {
Marat Dukhanc4302c22022-01-06 19:27:03 -0800191 SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith, 24, 1, 0.8f,
192 xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
Marat Dukhanbdb56f52020-02-05 21:42:49 -0800193 }
Frank Barchardbeca6522020-10-30 22:34:35 -0700194 static void spmm80_24x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
Marat Dukhanc4302c22022-01-06 19:27:03 -0800195 SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith_x2, 24, 1, 0.8f,
196 xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
Marat Dukhanbdb56f52020-02-05 21:42:49 -0800197 }
198 static void spmm80_32x1__neonfp16arith(benchmark::State& state, const char* net) {
Marat Dukhanc4302c22022-01-06 19:27:03 -0800199 SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith, 32, 1, 0.8f,
200 xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
Marat Dukhanbdb56f52020-02-05 21:42:49 -0800201 }
Frank Barchardbeca6522020-10-30 22:34:35 -0700202 static void spmm80_32x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
Marat Dukhanc4302c22022-01-06 19:27:03 -0800203 SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith_x2, 32, 1, 0.8f,
204 xnn_init_f16_scaleminmax_neon_params, benchmark::utils::CheckNEONFP16ARITH);
Marat Dukhanbdb56f52020-02-05 21:42:49 -0800205 }
206
Marat Dukhan1631e3e2020-12-06 19:29:31 -0800207 BENCHMARK_SPMM(spmm80_8x1__neonfp16arith)
208 BENCHMARK_SPMM(spmm80_8x1__neonfp16arith_x2)
209 BENCHMARK_SPMM(spmm80_16x1__neonfp16arith)
210 BENCHMARK_SPMM(spmm80_16x1__neonfp16arith_x2)
211 BENCHMARK_SPMM(spmm80_24x1__neonfp16arith)
212 BENCHMARK_SPMM(spmm80_24x1__neonfp16arith_x2)
213 BENCHMARK_SPMM(spmm80_32x1__neonfp16arith)
214 BENCHMARK_SPMM(spmm80_32x1__neonfp16arith_x2)
Marat Dukhanbdb56f52020-02-05 21:42:49 -0800215#endif // XNN_ARCH_ARM64
216
217#ifndef XNNPACK_BENCHMARK_NO_MAIN
218BENCHMARK_MAIN();
219#endif