// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>
#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>

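// Returns true if x is the IEEE fp16 bit pattern of +0.0 or -0.0.
// Doubling the value shifts the sign bit out of the low 16 bits, so
// (uint16_t) (2 * x) is zero exactly for the bit patterns 0x0000 and 0x8000.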
static inline bool is_fp16_zero(uint16_t x) {
  const uint32_t ext_x = x;
  const uint32_t two_x = ext_x + ext_x;
  return (uint16_t) two_x == 0;
}

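// Tester for sparse matrix-matrix multiplication (SpMM) micro-kernels.
// It builds a random dense B, sparsifies it, packs it into the kernel's
// compressed representation (w/dmap/nmap), runs the micro-kernel, and
// compares the output against a dense reference computation.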
class SpMMMicrokernelTester {
 public:
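  // Which parameter-initialization path to exercise: Native uses the
  // architecture-specific parameter layout (xnn_init_f32_minmax_params),
  // Scalar uses the portable layout (xnn_init_scalar_f32_minmax_params).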
  enum class Variant {
    Native,
    Scalar,
  };

  inline SpMMMicrokernelTester& mr(size_t mr) {
    this->mr_ = mr;
    return *this;
  }

  inline size_t mr() const {
    return this->mr_;
  }

  inline SpMMMicrokernelTester& nr(size_t nr) {
    this->nr_ = nr;
    return *this;
  }

  inline size_t nr() const {
    return this->nr_;
  }

  inline SpMMMicrokernelTester& m(size_t m) {
    this->m_ = m;
    return *this;
  }

  inline size_t m() const {
    return this->m_;
  }

  inline SpMMMicrokernelTester& n(size_t n) {
    this->n_ = n;
    return *this;
  }

  inline size_t n() const {
    return this->n_;
  }

  inline SpMMMicrokernelTester& k(size_t k) {
    this->k_ = k;
    return *this;
  }

  inline size_t k() const {
    return this->k_;
  }

  inline SpMMMicrokernelTester& sparsity(float sparsity) {
    this->sparsity_ = sparsity;
    return *this;
  }

  inline float sparsity() const {
    return this->sparsity_;
  }

  inline SpMMMicrokernelTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline SpMMMicrokernelTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  inline SpMMMicrokernelTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

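  // Tests an f32 SpMM micro-kernel: packs a sparsified random B, runs the
  // kernel, and checks the output against a dense reference computation.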
  void Test(xnn_f32_spmm_ukernel_function spmm, Variant variant = Variant::Native) const {
    ASSERT_GE(m(), 1);
    ASSERT_GE(n(), 1);
    ASSERT_GE(k(), 1);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);
    auto prng = std::bind(std::uniform_real_distribution<float>(), rng);

    std::vector<float, AlignedAllocator<float, 64>> a(k() * m());
    // B is stored as (n/nr + n % nr) rows of k values: one row per nr-wide
    // column block plus one row per leftover column. It is expanded to the
    // full n x k matrix later.
    const size_t ncols = n() / nr() + n() % nr();
    std::vector<float> b(ncols * k());
    std::vector<float> bias(n());
    // Number of non-zero weights per output-channel block.
    std::vector<uint32_t> nmap(n());
    // Mapping from index of non-zero weight to increment of K (input channel) following this index.
    std::vector<int32_t> dmap(n() * k());
    std::vector<float> w(n() * k() + n());
    std::vector<float> c(n() * m());
    std::vector<float> c_ref(n() * m());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(a.begin(), a.end(), std::ref(f32rng));
      std::generate(b.begin(), b.end(), std::ref(f32rng));
      std::generate(bias.begin(), bias.end(), std::ref(f32rng));
      std::fill(c.begin(), c.end(), nanf(""));
      std::fill(c_ref.begin(), c_ref.end(), 0.0f);
      std::fill(nmap.begin(), nmap.end(), 0);
      std::fill(dmap.begin(), dmap.end(), 0);
      std::fill(w.begin(), w.end(), 0.0f);

      for (float& b_value : b) {
        if (prng() <= sparsity()) {
          b_value = 0.0f;
        }
      }

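      // Pack B into the kernel's compressed representation:
      //  - w holds, per output-channel block, nr bias values followed by nr
      //    copies of each non-zero weight;
      //  - dmap holds, per non-zero, the byte increment that advances the
      //    input pointer to the next non-zero's input channel, i.e. the kk
      //    delta times m() * sizeof(float);
      //  - nmap holds the number of non-zeros in each block.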
      uint32_t nnz = 0;
      uint32_t wcnt = 0;
      size_t last_kk = 0;
      bool first_nzz = true;
      size_t first_kk = 0;
      for (size_t nn = 0; nn < n() / nr(); nn++) {
        for (size_t i = 0; i < nr(); ++i)
          w[wcnt++] = bias[nr() * nn + i];
        for (size_t kk = 0; kk < k(); kk++) {
          if (b[nn * k() + kk] != 0.0f) {
            // Every non-zero actually corresponds to nr adjacent non-zeros.
            for (size_t i = 0; i < nr(); ++i)
              w[wcnt++] = b[nn * k() + kk] + static_cast<float>(i);
            // Skip the very first non-zero weight, as only differences between
            // consecutive non-zeros are recorded.
            if (first_nzz) {
              first_kk = kk;
            } else {
              const int32_t increment = int32_t(kk - last_kk) * int32_t(m() * sizeof(float));
              dmap[nnz++] = increment;
            }
            last_kk = kk;
            first_nzz = false;
            nmap[nn] += 1;
          }
        }
      }

      // The blocked part of the matrix is done; the leftover columns are
      // always processed with nr = 1.
      for (size_t nn = n() / nr(); nn < ncols; nn++) {
        w[wcnt++] = bias[(n() / nr()) * nr() + (nn - n() / nr())];
        for (size_t kk = 0; kk < k(); kk++) {
          if (b[nn * k() + kk] != 0.0f) {
            w[wcnt++] = b[nn * k() + kk];
            // Skip the very first non-zero weight, as only differences between
            // consecutive non-zeros are recorded.
            if (first_nzz) {
              first_kk = kk;
            } else {
              const int32_t increment = int32_t(kk - last_kk) * int32_t(m() * sizeof(float));
              dmap[nnz++] = increment;
            }
            last_kk = kk;
            first_nzz = false;
            nmap[nn] += 1;
          }
        }
      }
      // The final increment wraps the input pointer back to the very first
      // non-zero, so the kernel ends where it started.
      const int32_t increment = int32_t(first_kk - last_kk) * int32_t(m() * sizeof(float));
      dmap[nnz++] = increment;

      // Generate the expanded B used by the reference computation: each
      // non-zero in a blocked row is replicated into nr adjacent rows, with
      // the weight value incremented by the row offset.
      std::vector<float> b_full(n() * k());
      if (nr() == 1) {
        b_full = b;
      }
      else {
        for (size_t nn = 0; nn < n() / nr(); nn++) {
          for (size_t kk = 0; kk < k(); kk++) {
            if (b[nn * k() + kk] != 0.0f) {
              for (size_t i = 0; i < nr(); ++i)
                b_full[nr() * nn * k() + i * k() + kk] = b[nn * k() + kk] + static_cast<float>(i);
            }
          }
        }
        for (size_t nn = n() / nr(); nn < ncols; nn++) {
          for (size_t kk = 0; kk < k(); kk++) {
            if (b[nn * k() + kk] != 0.0f) {
              b_full[nr() * (n() / nr()) * k() + (nn - n() / nr()) * k() + kk] = b[nn * k() + kk];
            }
          }
        }
      }

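      // Dense reference computation: C = B * A + bias, with A laid out as
      // K x M and C as N x M.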
      for (size_t oc = 0; oc < n(); oc++) {
        for (size_t pxb = 0; pxb < m(); pxb++) {
          c_ref[oc * m() + pxb] = bias[oc];
          for (size_t ic = 0; ic < k(); ic++) {
            c_ref[oc * m() + pxb] += a[ic * m() + pxb] * b_full[oc * k() + ic];
          }
        }
      }

      // The micro-kernel may read one element beyond the end of w and dmap
      // for software pipelining.
      w.resize(wcnt + 1);
      dmap.resize(nnz + 1);

      // Compute clamping parameters: qmin/qmax select a sub-range of the
      // observed output range, scaled as if the outputs were quantized to
      // 8 bits.
      const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
      const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
      const float c_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
      const float c_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());

      // Clamp reference results.
      for (float& c_value : c_ref) {
        c_value = std::min(std::max(c_value, c_min), c_max);
      }

      // Prepare output parameters.
      xnn_f32_minmax_params minmax_params = { };
      switch (variant) {
        case Variant::Native:
          minmax_params = xnn_init_f32_minmax_params(c_min, c_max);
          break;
        case Variant::Scalar:
          minmax_params = xnn_init_scalar_f32_minmax_params(c_min, c_max);
          break;
      }

      spmm(m(), n(),
        a.data() + first_kk * m(), w.data(), dmap.data(), nmap.data(), c.data(),
        &minmax_params);

      // Validate micro-kernel outputs against the clamped reference.
      for (size_t oc = 0; oc < n(); oc++) {
        for (size_t pxb = 0; pxb < m(); pxb++) {
          ASSERT_NEAR(
              c[oc * m() + pxb],
              c_ref[oc * m() + pxb],
              std::abs(c_ref[oc * m() + pxb]) * 1.0e-6f)
            << "at " << oc << ", " << pxb
            << ": Mr x Nr = " << mr() << " x " << nr()
            << ", M x N x K = " << m() << " x " << n() << " x " << k();
        }
      }
    }
  }

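  // Mirrors the f32 test above, with A, B, and C stored as IEEE fp16 while
  // the reference result is accumulated in fp32.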
  void Test(xnn_f16_spmm_ukernel_function spmm) const {
    ASSERT_GE(m(), 1);
    ASSERT_GE(n(), 1);
    ASSERT_GE(k(), 1);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(), rng);
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
    auto prng = std::bind(std::uniform_real_distribution<float>(), rng);

    std::vector<uint16_t, AlignedAllocator<uint16_t, 64>> a(k() * m());
    // B is stored as (n/nr + n % nr) rows of k values: one row per nr-wide
    // column block plus one row per leftover column. It is expanded to the
    // full n x k matrix later.
    const size_t ncols = n() / nr() + n() % nr();
    std::vector<uint16_t> b(ncols * k());
    std::vector<uint16_t> bias(n());
    // Number of non-zero weights per output-channel block.
    std::vector<uint32_t> nmap(n());
    // Mapping from index of non-zero weight to increment of K (input channel) following this index.
    std::vector<int32_t> dmap(n() * k());
    std::vector<uint16_t> w(n() * k() + n());
    std::vector<uint16_t> c(n() * m());
    std::vector<float> c_ref(n() * m());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(a.begin(), a.end(), std::ref(f16rng));
      std::generate(b.begin(), b.end(), std::ref(f16rng));
      std::generate(bias.begin(), bias.end(), std::ref(f16rng));
      std::fill(c.begin(), c.end(), 0xC000);
      std::fill(c_ref.begin(), c_ref.end(), 0.0f);
      std::fill(nmap.begin(), nmap.end(), 0);
      std::fill(dmap.begin(), dmap.end(), 0);
      std::fill(w.begin(), w.end(), 0);

      for (uint16_t& b_value : b) {
        if (prng() <= sparsity()) {
          b_value = 0;
        }
      }

      uint32_t nnz = 0;
      uint32_t wcnt = 0;
      size_t last_kk = 0;
      bool first_nzz = true;
      size_t first_kk = 0;
      for (size_t nn = 0; nn < n() / nr(); nn++) {
        for (size_t i = 0; i < nr(); ++i)
          w[wcnt++] = bias[nr() * nn + i];
        for (size_t kk = 0; kk < k(); kk++) {
          if (!is_fp16_zero(b[nn * k() + kk])) {
            // Every non-zero actually corresponds to nr adjacent non-zeros.
            for (size_t i = 0; i < nr(); ++i)
              w[wcnt++] = fp16_ieee_from_fp32_value(fp16_ieee_to_fp32_value(b[nn * k() + kk]) + static_cast<float>(i));
            // Skip the very first non-zero weight, as only differences between
            // consecutive non-zeros are recorded.
            if (first_nzz) {
              first_kk = kk;
            } else {
              const int32_t increment = int32_t(kk - last_kk) * int32_t(m() * sizeof(uint16_t));
              dmap[nnz++] = increment;
            }
            last_kk = kk;
            first_nzz = false;
            nmap[nn] += 1;
          }
        }
      }

      // The blocked part of the matrix is done; the leftover columns are
      // always processed with nr = 1.
      for (size_t nn = n() / nr(); nn < ncols; nn++) {
        w[wcnt++] = bias[(n() / nr()) * nr() + (nn - n() / nr())];
        for (size_t kk = 0; kk < k(); kk++) {
          if (!is_fp16_zero(b[nn * k() + kk])) {
            w[wcnt++] = b[nn * k() + kk];
            // Skip the very first non-zero weight, as only differences between
            // consecutive non-zeros are recorded.
            if (first_nzz) {
              first_kk = kk;
            } else {
              const int32_t increment = int32_t(kk - last_kk) * int32_t(m() * sizeof(uint16_t));
              dmap[nnz++] = increment;
            }
            last_kk = kk;
            first_nzz = false;
            nmap[nn] += 1;
          }
        }
      }
      // The final increment wraps the input pointer back to the very first
      // non-zero, so the kernel ends where it started.
      const int32_t increment = int32_t(first_kk - last_kk) * int32_t(m() * sizeof(uint16_t));
      dmap[nnz++] = increment;

      // Generate the expanded B used by the reference computation: each
      // non-zero in a blocked row is replicated into nr adjacent rows, with
      // the weight value incremented by the row offset.
      std::vector<uint16_t> b_full(n() * k());
      if (nr() == 1) {
        b_full = b;
      }
      else {
        for (size_t nn = 0; nn < n() / nr(); nn++) {
          for (size_t kk = 0; kk < k(); kk++) {
            if (!is_fp16_zero(b[nn * k() + kk])) {
              for (size_t i = 0; i < nr(); ++i)
                b_full[nr() * nn * k() + i * k() + kk] = fp16_ieee_from_fp32_value(
                  fp16_ieee_to_fp32_value(b[nn * k() + kk]) + static_cast<float>(i));
            }
          }
        }
        for (size_t nn = n() / nr(); nn < ncols; nn++) {
          for (size_t kk = 0; kk < k(); kk++) {
            if (!is_fp16_zero(b[nn * k() + kk])) {
              b_full[nr() * (n() / nr()) * k() + (nn - n() / nr()) * k() + kk] = b[nn * k() + kk];
            }
          }
        }
      }

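      // Dense reference computation in fp32: C = B * A + bias, with A laid
      // out as K x M and C as N x M.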
      for (size_t oc = 0; oc < n(); oc++) {
        for (size_t pxb = 0; pxb < m(); pxb++) {
          c_ref[oc * m() + pxb] = fp16_ieee_to_fp32_value(bias[oc]);
          for (size_t ic = 0; ic < k(); ic++) {
            c_ref[oc * m() + pxb] += fp16_ieee_to_fp32_value(a[ic * m() + pxb]) * fp16_ieee_to_fp32_value(b_full[oc * k() + ic]);
          }
        }
      }

      // The micro-kernel may read one element beyond the end of w and dmap
      // for software pipelining.
      w.resize(wcnt + 1);
      dmap.resize(nnz + 1);

      // Compute clamping parameters: qmin/qmax select a sub-range of the
      // observed output range, scaled as if the outputs were quantized to
      // 8 bits.
      const float accumulated_min = *std::min_element(c_ref.cbegin(), c_ref.cend());
      const float accumulated_max = *std::max_element(c_ref.cbegin(), c_ref.cend());
      const float c_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
      const float c_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());

      // Clamp reference results.
      for (float& c_value : c_ref) {
        c_value = std::min(std::max(c_value, c_min), c_max);
      }

      // Prepare output parameters.
      xnn_f16_output_params output_params;
      output_params.scale = UINT16_C(0x3C00) /* 1.0 */;
      output_params.max = fp16_ieee_from_fp32_value(c_max);
      output_params.min = fp16_ieee_from_fp32_value(c_min);

      spmm(m(), n(),
        a.data() + first_kk * m(), w.data(), dmap.data(), nmap.data(), c.data(),
        &output_params);

      // Validate micro-kernel outputs against the clamped reference.
      for (size_t oc = 0; oc < n(); oc++) {
        for (size_t pxb = 0; pxb < m(); pxb++) {
          ASSERT_NEAR(
              fp16_ieee_to_fp32_value(c[oc * m() + pxb]),
              c_ref[oc * m() + pxb],
              std::abs(c_ref[oc * m() + pxb]) * 1.0e-2f)
            << "at " << oc << ", " << pxb
            << ": Mr x Nr = " << mr() << " x " << nr()
            << ", M x N x K = " << m() << " x " << n() << " x " << k();
        }
      }
    }
  }

 private:
  size_t mr_{1};
  size_t nr_{1};
  size_t m_{1};
  size_t n_{1};
  size_t k_{1};
  float sparsity_{0.5f};
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  size_t iterations_{1};
};
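
// Minimal usage sketch (the kernel symbol below is hypothetical; substitute
// an actual xnn_f32_spmm_ukernel_function provided by XNNPACK):
//
//   TEST(F32_SPMM_8x1__SCALAR, k_eq_1) {
//     SpMMMicrokernelTester()
//       .mr(8)
//       .nr(1)
//       .m(8)
//       .n(1)
//       .k(1)
//       .sparsity(0.5f)
//       .Test(xnn_f32_spmm_ukernel_8x1__scalar,
//             SpMMMicrokernelTester::Variant::Scalar);
//   }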