// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <functional>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#include <fp16/fp16.h>
#include "bench/spmm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
#include <xnnpack/spmm.h>

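// Benchmarks an f16 sparse-matrix * dense-matrix multiplication micro-kernel.
// The weights are packed in the compressed layout the SpMM kernels consume:
// w stores, per output channel, the bias followed by the non-zero weight
// values; dmap stores the byte increment applied to the input (A) pointer
// between consecutive non-zero weights; nmap stores the count of non-zero
// weights per output channel.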
static void SpMMBenchmark(benchmark::State& state,
  xnn_f16_spmm_minmax_ukernel_function spmm, uint32_t mr, uint32_t nr, float sparsity)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (!benchmark::utils::CheckNEONFP16ARITH(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // If using blocks, generate the reduced matrix first and then extrude along
  // the block dimension (n) to get the full matrix: ncols counts one reduced
  // row per full nr-wide block of output channels, plus one per remainder
  // column.
  size_t ncols = nc / nr + nc % nr;
  std::vector<uint16_t> b(ncols * kc);
  std::vector<uint16_t> bias(nc);
  std::vector<uint16_t> w;
  std::vector<uint32_t> nmap;
  std::vector<int32_t> dmap;
  const size_t sparse_end = std::min(size_t(float(b.size()) * sparsity), b.size());
  const size_t num_nonzeroes = nr * (b.size() - sparse_end);
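  // Each non-zero of the reduced (ncols x kc) matrix b is extruded into nr
  // non-zeroes of the full matrix; with the Nx1 kernels benchmarked below
  // (nr = 1) this count is exact, otherwise it is an upper bound.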

  const size_t w_elements = num_nonzeroes + nc;
  const size_t c_elements = mc * nc;
  const size_t dmap_elements = num_nonzeroes / nr;
  const size_t nmap_elements = nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (w_elements + c_elements) + sizeof(uint32_t) * (dmap_elements + nmap_elements));
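  // Allocate enough rotating copies of W, C, dmap, and nmap that their
  // combined footprint exceeds the largest cache, so every iteration starts
  // with these buffers evicted.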

  // Micro-kernel can access one element beyond w and dmap for software pipelining.
  w.reserve(num_buffers * w_elements + 1);
  dmap.reserve(num_buffers * dmap_elements + 1);
  nmap.resize(num_buffers * nmap_elements);

  std::vector<size_t> a_offsets(num_buffers);

  for (size_t buffer_index = 0; buffer_index < num_buffers; buffer_index++) {
    // Re-generate weights. Note: each re-generation produces the same number of non-zeroes.
    std::fill(b.begin(), b.begin() + sparse_end, 0);
    std::generate(b.begin() + sparse_end, b.end(), std::ref(f16rng));
    std::shuffle(b.begin(), b.end(), rng);
    std::generate(bias.begin(), bias.end(), std::ref(f16rng));

    uint32_t first_j = 0, last_j = 0;
    bool is_first_nonzero = true;
    for (uint32_t i = 0; i < nc / nr; i++) {
      for (uint32_t n = 0; n < nr; n++)
        w.push_back(bias[nr * i + n]);
      for (uint32_t j = 0; j < kc; j++) {
        if ((b[i * kc + j] & 0x7FFF) != 0) {
          for (size_t l = 0; l < nr; l++)
            w.push_back(fp16_ieee_from_fp32_value(fp16_ieee_to_fp32_value(b[i * kc + j]) + static_cast<float>(i)));
          if (is_first_nonzero) {
            first_j = j;
          } else {
            const ptrdiff_t increment = int32_t(j - last_j) * int32_t(mc) * int32_t(sizeof(uint16_t));
            dmap.push_back(increment);
          }
          last_j = j;
          is_first_nonzero = false;
          nmap[buffer_index * nmap_elements + i] += 1;
        }
      }
    }
    for (uint32_t i = nc / nr; i < ncols; i++) {
      w.push_back(bias[i]);
      for (uint32_t j = 0; j < kc; j++) {
        if ((b[i * kc + j] & 0x7FFF) != 0) {
          w.push_back(b[i * kc + j]);
          if (is_first_nonzero) {
            first_j = j;
          } else {
            const ptrdiff_t increment = int32_t(j - last_j) * int32_t(mc) * int32_t(sizeof(uint16_t));
            dmap.push_back(increment);
          }
          last_j = j;
          is_first_nonzero = false;
          nmap[buffer_index * nmap_elements + i] += 1;
        }
      }
    }
    {
      const ptrdiff_t increment = int32_t(first_j - last_j) * int32_t(mc) * int32_t(sizeof(uint16_t));
      dmap.push_back(increment);
    }

    a_offsets[buffer_index] = first_j * mc;
  }
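  // The final dmap increment (first_j - last_j) rewinds the input pointer from
  // the last non-zero row of A back to the first, so that each pass over the
  // output channels starts at the same offset, recorded in a_offsets.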

  // Micro-kernel can access one element beyond w and dmap for software pipelining.
  w.resize(w.size() + 1);
  dmap.resize(dmap.size() + 1);

  // A and C hold IEEE fp16 values stored as uint16_t, matching the
  // sizeof(uint16_t) strides passed to the micro-kernel; C is poisoned with
  // fp16 NaN.
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> a(kc * mc);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> c(num_buffers * c_elements);

  std::generate(a.begin(), a.end(), std::ref(f16rng));
  std::fill(c.begin(), c.end(), UINT16_C(0x7E00) /* fp16 NaN */);

  xnn_f16_scaleminmax_params params{
    0x3C00 /* 1.0 */, 0x7C00 /* inf */, 0xFC00 /* -inf */};
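  // Scale 1.0 with clamps at -inf/+inf (as fp16 bit patterns above) makes the
  // scale-min-max post-processing an identity, so only the multiply-accumulate
  // work is measured.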

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W, Dmap, and Nmap are not in cache (at any cache level)
    // - C is not in cache (at any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

    spmm(mc * sizeof(uint16_t), nc,
      a.data() + a_offsets[buffer_index],
      w.data() + buffer_index * w_elements,
      dmap.data() + buffer_index * dmap_elements,
      nmap.data() + buffer_index * nmap_elements,
      c.data() + buffer_index * c_elements, mc * sizeof(uint16_t),
      &params);
  }

  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
  if (cpu_frequency != 0) {
    state.counters["cpufreq"] = cpu_frequency;
  }

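  // FLOPS counts arithmetic actually executed: one multiply and one add per
  // (non-zero weight, input column) pair.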
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * num_nonzeroes, benchmark::Counter::kIsRate);

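  // EffFLOPS counts the work of an equivalent dense GEMM (2 * mc * nc * kc);
  // comparing it against FLOPS shows the effective gain from sparsity.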
  state.counters["EffFLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}


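// Naming convention: "spmm80" denotes 80% sparsity (the 0.8f argument), "MxN"
// the MRxNR output tile of the micro-kernel, and the "_x2" suffix appears to
// mark variants with the inner non-zero loop unrolled by two.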
#if XNN_ARCH_ARM64
  static void spmm80_8x1__neonfp16arith(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith, 8, 1, 0.8f);
  }
  static void spmm80_8x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith_x2, 8, 1, 0.8f);
  }
  static void spmm80_16x1__neonfp16arith(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith, 16, 1, 0.8f);
  }
  static void spmm80_16x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith_x2, 16, 1, 0.8f);
  }
  static void spmm80_24x1__neonfp16arith(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith, 24, 1, 0.8f);
  }
  static void spmm80_24x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith_x2, 24, 1, 0.8f);
  }
  static void spmm80_32x1__neonfp16arith(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith, 32, 1, 0.8f);
  }
  static void spmm80_32x1__neonfp16arith_x2(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith_x2, 32, 1, 0.8f);
  }

  BENCHMARK_SPMM(spmm80_8x1__neonfp16arith)
  BENCHMARK_SPMM(spmm80_8x1__neonfp16arith_x2)
  BENCHMARK_SPMM(spmm80_16x1__neonfp16arith)
  BENCHMARK_SPMM(spmm80_16x1__neonfp16arith_x2)
  BENCHMARK_SPMM(spmm80_24x1__neonfp16arith)
  BENCHMARK_SPMM(spmm80_24x1__neonfp16arith_x2)
  BENCHMARK_SPMM(spmm80_32x1__neonfp16arith)
  BENCHMARK_SPMM(spmm80_32x1__neonfp16arith_x2)
#endif  // XNN_ARCH_ARM64
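// When built as a standalone binary, individual kernels can be selected at run
// time via Google Benchmark's --benchmark_filter flag, e.g.
// --benchmark_filter=spmm80_16x1.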
#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif