// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cfloat>
#include <cmath>
#include <functional>
#include <random>
#include <vector>

#include <cpuinfo.h>

#include <benchmark/benchmark.h>
#include <fp16/fp16.h>
#include "bench/gemm.h"
#include "bench/utils.h"
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>
#include <xnnpack/spmm.h>


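// Benchmarks a single f16 sparse-matrix-times-dense-matrix (SpMM) micro-kernel on a
// synthetically generated sparse weight matrix with the requested sparsity. The packed
// weights (w), input-pointer increments (dmap), and per-channel non-zero counts (nmap)
// are built below in the layout the micro-kernel expects.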
static void SpMMBenchmark(benchmark::State& state,
  xnn_f16_spmm_minmax_ukernel_function spmm, uint32_t mr, uint32_t nr, float sparsity)
{
  if (!cpuinfo_initialize()) {
    state.SkipWithError("cpuinfo initialization failed");
    return;
  }
  if (!benchmark::utils::CheckNEONFP16ARITH(state)) {
    return;
  }

  const size_t mc = state.range(0);
  const size_t nc = state.range(1);
  const size_t kc = state.range(2);

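  // Problem size from the BENCHMARK_GEMM ranges: mc, nc, and kc correspond to
  // the M, N, and K dimensions of the equivalent dense GEMM (in convolution
  // terms, roughly: pixels, output channels, and input channels).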
  std::random_device random_device;
  auto rng = std::mt19937(random_device());
  auto f32rng = std::bind(std::uniform_real_distribution<float>(), std::ref(rng));
  auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

  // If using blocks, generate the reduced matrix first, then extrude it along
  // the block dimension (n) to get the full matrix.
  size_t ncols = nc / nr + nc % nr;
  std::vector<uint16_t> b(ncols * kc);
  std::vector<uint16_t> bias(nc);
  std::vector<uint16_t> w;
  std::vector<uint32_t> nmap;
  std::vector<int32_t> dmap;
  const size_t sparse_end = std::min(size_t(float(b.size()) * sparsity), b.size());
  const size_t num_nonzeroes = nr * (b.size() - sparse_end);

  const size_t w_elements = num_nonzeroes + nc;
  const size_t c_elements = mc * nc;
  const size_t dmap_elements = num_nonzeroes / nr;
  const size_t nmap_elements = nc;
  const size_t num_buffers = 1 +
    benchmark::utils::DivideRoundUp<size_t>(benchmark::utils::GetMaxCacheSize(),
      sizeof(uint16_t) * (w_elements + c_elements) + sizeof(uint32_t) * (dmap_elements + nmap_elements));

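  // num_buffers is chosen so that the combined working set of all weight/output
  // copies exceeds the last-level cache: cycling through the copies below keeps
  // W, dmap, nmap, and C out of cache on every timed iteration.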
  // Micro-kernel can access one element beyond w and dmap for software pipelining.
  w.reserve(num_buffers * w_elements + 1);
  dmap.reserve(num_buffers * dmap_elements + 1);
  nmap.resize(num_buffers * nmap_elements);

  std::vector<size_t> a_offsets(num_buffers);

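  // Packed sparse representation, one copy per buffer:
  // - w:    for each output-channel block, nr bias values followed by nr copies
  //         of every non-zero weight in that block;
  // - nmap: number of non-zero weights per output-channel block;
  // - dmap: byte increment to advance the input pointer from one non-zero input
  //         channel to the next (A stores mc consecutive fp16 values per input channel).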
  for (size_t buffer_index = 0; buffer_index < num_buffers; buffer_index++) {
    // Re-generate weights. Note: each re-generation produces the same number of non-zeroes.
    std::fill(b.begin(), b.begin() + sparse_end, 0);
    std::generate(b.begin() + sparse_end, b.end(), std::ref(f16rng));
    std::shuffle(b.begin(), b.end(), rng);
    std::generate(bias.begin(), bias.end(), std::ref(f16rng));

    uint32_t first_j = 0, last_j = 0;
    bool is_first_nonzero = true;
    for (uint32_t i = 0; i < nc / nr; i++) {
      for (uint32_t n = 0; n < nr; n++)
        w.push_back(bias[nr * i + n]);
      for (uint32_t j = 0; j < kc; j++) {
        if ((b[i * kc + j] & 0x7FFF) != 0) {
          for (size_t l = 0; l < nr; l++)
            w.push_back(fp16_ieee_from_fp32_value(fp16_ieee_to_fp32_value(b[i * kc + j]) + static_cast<float>(i)));
          if (is_first_nonzero) {
            first_j = j;
          } else {
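            // Distance (in bytes) from the previous non-zero input channel to
            // this one; the micro-kernel adds it to the input pointer after
            // processing each non-zero weight.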
            const ptrdiff_t increment = int32_t(j - last_j) * int32_t(mc) * int32_t(sizeof(uint16_t));
            dmap.push_back(increment);
          }
          last_j = j;
          is_first_nonzero = false;
          nmap[buffer_index * nmap_elements + i] += 1;
        }
      }
    }
    for (uint32_t i = nc / nr; i < ncols; i++) {
      w.push_back(bias[i]);
      for (uint32_t j = 0; j < kc; j++) {
        if ((b[i * kc + j] & 0x7FFF) != 0) {
          w.push_back(b[i * kc + j]);
          if (is_first_nonzero) {
            first_j = j;
          } else {
            const ptrdiff_t increment = int32_t(j - last_j) * int32_t(mc) * int32_t(sizeof(uint16_t));
            dmap.push_back(increment);
          }
          last_j = j;
          is_first_nonzero = false;
          nmap[buffer_index * nmap_elements + i] += 1;
        }
      }
    }
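    // The final dmap entry wraps the input pointer from the last non-zero input
    // channel back to the first one, so the pointer returns to its starting
    // position after a full pass over the non-zero weights.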
    {
      const ptrdiff_t increment = int32_t(first_j - last_j) * int32_t(mc) * int32_t(sizeof(uint16_t));
      dmap.push_back(increment);
    }

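    // The kernel's input pointer starts at the first non-zero input channel of
    // this buffer's weight matrix.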
    a_offsets[buffer_index] = first_j * mc;
  }

  // Micro-kernel can access one element beyond w and dmap for software pipelining.
  w.resize(w.size() + 1);
  dmap.resize(dmap.size() + 1);

  // A and C hold fp16 values as raw uint16_t bits, matching the f16 micro-kernel.
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> a(kc * mc);
  std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> c(num_buffers * c_elements);

  std::generate(a.begin(), a.end(), std::ref(f16rng));
  std::fill(c.begin(), c.end(), fp16_ieee_from_fp32_value(nanf("")));

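  // Output transform parameters as raw fp16 bit patterns: scale = 1.0 and
  // clamping limits of -inf/+inf (i.e. effectively no clamping).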
  xnn_f16_scaleminmax_params params{
    0x3C00 /* 1.0 */, 0x7C00 /* inf */, 0xFC00 /* -inf */};

  size_t buffer_index = 0;
  for (auto _ : state) {
    // Use circular buffers (exceeding cache size) and prefetch to control cache state:
    // - A is always in L1 cache (if it fits; otherwise L2, L3, etc.)
    // - W, dmap, and nmap are not in cache (for any cache level)
    // - C is not in cache (for any cache level)
    state.PauseTiming();
    benchmark::utils::PrefetchToL1(a.data(), a.size() * sizeof(uint16_t));
    buffer_index = (buffer_index + 1) % num_buffers;
    state.ResumeTiming();

148
149 spmm(mc, nc,
150 a.data() + a_offsets[buffer_index],
151 w.data() + buffer_index * w_elements,
152 dmap.data() + buffer_index * dmap_elements,
153 nmap.data() + buffer_index * nmap_elements,
154 c.data() + buffer_index * c_elements,
Frank Barchard77acbf22020-05-01 10:08:26 -0700155 &params);
Marat Dukhanbdb56f52020-02-05 21:42:49 -0800156 }
157
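  // FLOPS counts only the multiply-adds actually performed on non-zero weights;
  // EffFLOPS counts the work of the equivalent dense GEMM (mc x nc x kc).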
  state.counters["Freq"] = benchmark::utils::GetCurrentCpuFrequency();
  state.counters["FLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * num_nonzeroes, benchmark::Counter::kIsRate);

  state.counters["EffFLOPS"] = benchmark::Counter(
    uint64_t(state.iterations()) * 2 * mc * nc * kc, benchmark::Counter::kIsRate);
}


#if XNN_ARCH_ARM64
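  // Benchmark naming: spmm80_* kernels run at 80% sparsity; the MxN suffix
  // (e.g. 8x1) is the micro-kernel's MRxNR tile, and the _unroll2 variants use
  // kernels with a 2x-unrolled loop over the non-zero weights.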
  static void spmm80_8x1__neonfp16arith(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith, 8, 1, 0.8f);
  }
  static void spmm80_8x1__neonfp16arith_unroll2(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_8x1__neonfp16arith_unroll2, 8, 1, 0.8f);
  }
  static void spmm80_16x1__neonfp16arith(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith, 16, 1, 0.8f);
  }
  static void spmm80_16x1__neonfp16arith_unroll2(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_16x1__neonfp16arith_unroll2, 16, 1, 0.8f);
  }
  static void spmm80_24x1__neonfp16arith(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith, 24, 1, 0.8f);
  }
  static void spmm80_24x1__neonfp16arith_unroll2(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_24x1__neonfp16arith_unroll2, 24, 1, 0.8f);
  }
  static void spmm80_32x1__neonfp16arith(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith, 32, 1, 0.8f);
  }
  static void spmm80_32x1__neonfp16arith_unroll2(benchmark::State& state, const char* net) {
    SpMMBenchmark(state, xnn_f16_spmm_minmax_ukernel_32x1__neonfp16arith_unroll2, 32, 1, 0.8f);
  }

  BENCHMARK_GEMM(spmm80_8x1__neonfp16arith)
  BENCHMARK_GEMM(spmm80_8x1__neonfp16arith_unroll2)
  BENCHMARK_GEMM(spmm80_16x1__neonfp16arith)
  BENCHMARK_GEMM(spmm80_16x1__neonfp16arith_unroll2)
  BENCHMARK_GEMM(spmm80_24x1__neonfp16arith)
  BENCHMARK_GEMM(spmm80_24x1__neonfp16arith_unroll2)
  BENCHMARK_GEMM(spmm80_32x1__neonfp16arith)
  BENCHMARK_GEMM(spmm80_32x1__neonfp16arith_unroll2)
#endif  // XNN_ARCH_ARM64

#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif