// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>


class ConvolutionOperatorTester {
 public:
  enum class WeightsType {
    Default,
    FP32,
  };

  inline ConvolutionOperatorTester& padding_tf_same(bool padding_same) {
    if (padding_same) {
      assert(padding_top() == 0);
      assert(padding_left() == 0);
      assert(padding_bottom() == 0);
      assert(padding_right() == 0);
    }
    this->padding_tf_same_ = padding_same;
    return *this;
  }

  inline bool padding_tf_same() const {
    return this->padding_tf_same_;
  }

  inline ConvolutionOperatorTester& padding(uint32_t padding) {
    assert(!padding_tf_same());
    this->padding_top_ = padding;
    this->padding_right_ = padding;
    this->padding_bottom_ = padding;
    this->padding_left_ = padding;
    return *this;
  }

  inline ConvolutionOperatorTester& padding(uint32_t padding_height, uint32_t padding_width) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_height;
    this->padding_right_ = padding_width;
    this->padding_bottom_ = padding_height;
    this->padding_left_ = padding_width;
    return *this;
  }

  inline ConvolutionOperatorTester& padding_height(uint32_t padding_height) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_height;
    this->padding_bottom_ = padding_height;
    return *this;
  }

  inline ConvolutionOperatorTester& padding_width(uint32_t padding_width) {
    assert(!padding_tf_same());
    this->padding_right_ = padding_width;
    this->padding_left_ = padding_width;
    return *this;
  }

  inline ConvolutionOperatorTester& padding_top(uint32_t padding_top) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_top;
    return *this;
  }

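  // When TF SAME padding is requested, the explicit padding must stay zero; the effective padding is
  // derived from the output size, with the top/left side getting the smaller half of the total padding
  // and the bottom/right side getting the remainder.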
  inline uint32_t padding_top() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_height =
        (output_height() - 1) * subsampling_height() + dilated_kernel_height() - input_height();
      return total_padding_height / 2;
    } else {
      return this->padding_top_;
    }
  }

  inline ConvolutionOperatorTester& padding_left(uint32_t padding_left) {
    assert(!padding_tf_same());
    this->padding_left_ = padding_left;
    return *this;
  }

  inline uint32_t padding_left() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_width =
        (output_width() - 1) * subsampling_width() + dilated_kernel_width() - input_width();
      return total_padding_width / 2;
    } else {
      return this->padding_left_;
    }
  }

  inline ConvolutionOperatorTester& padding_bottom(uint32_t padding_bottom) {
    assert(!padding_tf_same());
    this->padding_bottom_ = padding_bottom;
    return *this;
  }

  inline uint32_t padding_bottom() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_height =
        (output_height() - 1) * subsampling_height() + dilated_kernel_height() - input_height();
      return total_padding_height - total_padding_height / 2;
    } else {
      return this->padding_bottom_;
    }
  }

  inline ConvolutionOperatorTester& padding_right(uint32_t padding_right) {
    assert(!padding_tf_same());
    this->padding_right_ = padding_right;
    return *this;
  }

  inline uint32_t padding_right() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_width =
        (output_width() - 1) * subsampling_width() + dilated_kernel_width() - input_width();
      return total_padding_width - total_padding_width / 2;
    } else {
      return this->padding_right_;
    }
  }

  inline ConvolutionOperatorTester& input_size(uint32_t input_height, uint32_t input_width) {
    assert(input_height >= 1);
    assert(input_width >= 1);
    this->input_height_ = input_height;
    this->input_width_ = input_width;
    return *this;
  }

  inline ConvolutionOperatorTester& input_height(uint32_t input_height) {
    assert(input_height >= 1);
    this->input_height_ = input_height;
    return *this;
  }

  inline uint32_t input_height() const {
    return this->input_height_;
  }

  inline ConvolutionOperatorTester& input_width(uint32_t input_width) {
    assert(input_width >= 1);
    this->input_width_ = input_width;
    return *this;
  }

  inline uint32_t input_width() const {
    return this->input_width_;
  }

  inline ConvolutionOperatorTester& groups(uint32_t groups) {
    assert(groups >= 1);
    this->groups_ = groups;
    return *this;
  }

  inline uint32_t groups() const {
    return this->groups_;
  }

  inline ConvolutionOperatorTester& group_input_channels(size_t group_input_channels) {
    assert(group_input_channels >= 1);
    this->group_input_channels_ = group_input_channels;
    return *this;
  }

  inline size_t group_input_channels() const {
    return this->group_input_channels_;
  }

  inline ConvolutionOperatorTester& group_output_channels(size_t group_output_channels) {
    assert(group_output_channels >= 1);
    this->group_output_channels_ = group_output_channels;
    return *this;
  }

  inline size_t group_output_channels() const {
    return this->group_output_channels_;
  }

  inline ConvolutionOperatorTester& batch_size(size_t batch_size) {
    assert(batch_size >= 1);
    this->batch_size_ = batch_size;
    return *this;
  }

  inline size_t batch_size() const {
    return this->batch_size_;
  }

  inline ConvolutionOperatorTester& kernel_size(uint32_t kernel_size) {
    assert(kernel_size >= 1);
    this->kernel_height_ = kernel_size;
    this->kernel_width_ = kernel_size;
    return *this;
  }

  inline ConvolutionOperatorTester& kernel_size(uint32_t kernel_height, uint32_t kernel_width) {
    assert(kernel_height >= 1);
    assert(kernel_width >= 1);
    this->kernel_height_ = kernel_height;
    this->kernel_width_ = kernel_width;
    return *this;
  }

  inline ConvolutionOperatorTester& kernel_height(uint32_t kernel_height) {
    assert(kernel_height >= 1);
    this->kernel_height_ = kernel_height;
    return *this;
  }

  inline uint32_t kernel_height() const {
    return this->kernel_height_;
  }

  inline ConvolutionOperatorTester& kernel_width(uint32_t kernel_width) {
    assert(kernel_width >= 1);
    this->kernel_width_ = kernel_width;
    return *this;
  }

  inline uint32_t kernel_width() const {
    return this->kernel_width_;
  }

  inline ConvolutionOperatorTester& dilation(uint32_t dilation) {
    assert(dilation >= 1);
    this->dilation_height_ = dilation;
    this->dilation_width_ = dilation;
    return *this;
  }

  inline ConvolutionOperatorTester& dilation(uint32_t dilation_height, uint32_t dilation_width) {
    assert(dilation_height >= 1);
    assert(dilation_width >= 1);
    this->dilation_height_ = dilation_height;
    this->dilation_width_ = dilation_width;
    return *this;
  }

  inline ConvolutionOperatorTester& dilation_height(uint32_t dilation_height) {
    assert(dilation_height >= 1);
    this->dilation_height_ = dilation_height;
    return *this;
  }

  inline uint32_t dilation_height() const {
    return this->dilation_height_;
  }

  inline ConvolutionOperatorTester& dilation_width(uint32_t dilation_width) {
    assert(dilation_width >= 1);
    this->dilation_width_ = dilation_width;
    return *this;
  }

  inline uint32_t dilation_width() const {
    return this->dilation_width_;
  }

  inline ConvolutionOperatorTester& subsampling(uint32_t subsampling) {
    assert(subsampling >= 1);
    this->subsampling_height_ = subsampling;
    this->subsampling_width_ = subsampling;
    return *this;
  }

  inline ConvolutionOperatorTester& subsampling(uint32_t subsampling_height, uint32_t subsampling_width) {
    assert(subsampling_height >= 1);
    assert(subsampling_width >= 1);
    this->subsampling_height_ = subsampling_height;
    this->subsampling_width_ = subsampling_width;
    return *this;
  }

  inline ConvolutionOperatorTester& subsampling_height(uint32_t subsampling_height) {
    assert(subsampling_height >= 1);
    this->subsampling_height_ = subsampling_height;
    return *this;
  }

  inline uint32_t subsampling_height() const {
    return this->subsampling_height_;
  }

  inline ConvolutionOperatorTester& subsampling_width(uint32_t subsampling_width) {
    assert(subsampling_width >= 1);
    this->subsampling_width_ = subsampling_width;
    return *this;
  }

  inline uint32_t subsampling_width() const {
    return this->subsampling_width_;
  }

  inline ConvolutionOperatorTester& input_channel_stride(size_t input_channel_stride) {
    assert(input_channel_stride >= 1);
    this->input_channel_stride_ = input_channel_stride;
    return *this;
  }

  inline size_t input_channel_stride() const {
    if (this->input_channel_stride_ == 0) {
      return group_input_channels() * groups();
    } else {
      assert(this->input_channel_stride_ >= group_input_channels() * groups());
      return this->input_channel_stride_;
    }
  }

  inline ConvolutionOperatorTester& output_channel_stride(size_t output_channel_stride) {
    assert(output_channel_stride >= 1);
    this->output_channel_stride_ = output_channel_stride;
    return *this;
  }

  inline size_t output_channel_stride() const {
    if (this->output_channel_stride_ == 0) {
      return group_output_channels() * groups();
    } else {
      assert(this->output_channel_stride_ >= group_output_channels() * groups());
      return this->output_channel_stride_;
    }
  }

  inline uint32_t dilated_kernel_height() const {
    return (kernel_height() - 1) * dilation_height() + 1;
  }

  inline uint32_t dilated_kernel_width() const {
    return (kernel_width() - 1) * dilation_width() + 1;
  }

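  // Output size follows the usual convolution arithmetic: ceil(input / stride) under TF SAME padding,
  // otherwise (padded input - dilated kernel) / stride + 1, clamped to a minimum of 1.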
  inline size_t output_height() const {
    if (padding_tf_same()) {
      return (input_height() + subsampling_height() - 1) / subsampling_height();
    } else {
      const size_t padded_input_height = padding_top() + input_height() + padding_bottom();
      if (padded_input_height <= dilated_kernel_height()) {
        return 1;
      } else {
        return (padded_input_height - dilated_kernel_height()) / subsampling_height() + 1;
      }
    }
  }

  inline size_t output_width() const {
    if (padding_tf_same()) {
      return (input_width() + subsampling_width() - 1) / subsampling_width();
    } else {
      const size_t padded_input_width = padding_left() + input_width() + padding_right();
      if (padded_input_width <= dilated_kernel_width()) {
        return 1;
      } else {
        return (padded_input_width - dilated_kernel_width()) / subsampling_width() + 1;
      }
    }
  }

  inline ConvolutionOperatorTester& next_input_size(uint32_t next_input_height, uint32_t next_input_width) {
    assert(next_input_height >= 1);
    assert(next_input_width >= 1);
    this->next_input_height_ = next_input_height;
    this->next_input_width_ = next_input_width;
    return *this;
  }

  inline ConvolutionOperatorTester& next_input_height(uint32_t next_input_height) {
    assert(next_input_height >= 1);
    this->next_input_height_ = next_input_height;
    return *this;
  }

  inline uint32_t next_input_height() const {
    if (this->next_input_height_ == 0) {
      return input_height();
    } else {
      return this->next_input_height_;
    }
  }

  inline ConvolutionOperatorTester& next_input_width(uint32_t next_input_width) {
    assert(next_input_width >= 1);
    this->next_input_width_ = next_input_width;
    return *this;
  }

  inline uint32_t next_input_width() const {
    if (this->next_input_width_ == 0) {
      return input_width();
    } else {
      return this->next_input_width_;
    }
  }

  inline size_t next_output_height() const {
    const size_t padded_input_height = padding_top() + next_input_height() + padding_bottom();
    if (padded_input_height <= dilated_kernel_height()) {
      return 1;
    } else {
      return (padded_input_height - dilated_kernel_height()) / subsampling_height() + 1;
    }
  }

  inline size_t next_output_width() const {
    const size_t padded_input_width = padding_left() + next_input_width() + padding_right();
    if (padded_input_width <= dilated_kernel_width()) {
      return 1;
    } else {
      return (padded_input_width - dilated_kernel_width()) / subsampling_width() + 1;
    }
  }

  inline ConvolutionOperatorTester& next_batch_size(size_t next_batch_size) {
    assert(next_batch_size >= 1);
    this->next_batch_size_ = next_batch_size;
    return *this;
  }

  inline size_t next_batch_size() const {
    if (this->next_batch_size_ == 0) {
      return batch_size();
    } else {
      return this->next_batch_size_;
    }
  }

  inline ConvolutionOperatorTester& sparsity(float sparsity) {
    this->sparsity_ = sparsity;
    return *this;
  }

  inline float sparsity() const {
    return this->sparsity_;
  }

  inline ConvolutionOperatorTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline ConvolutionOperatorTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  inline ConvolutionOperatorTester& force_nhwc_input(bool force_nhwc_input) {
    this->force_nhwc_input_ = force_nhwc_input;
    return *this;
  }

  inline bool force_nhwc_input() const {
    return this->force_nhwc_input_;
  }

  inline ConvolutionOperatorTester& depthwise_layout(bool depthwise_layout) {
    this->depthwise_layout_ = depthwise_layout;
    return *this;
  }

  inline bool depthwise_layout() const {
    return this->depthwise_layout_;
  }

  inline ConvolutionOperatorTester& has_bias(bool has_bias) {
    this->has_bias_ = has_bias;
    return *this;
  }

  inline bool has_bias() const {
    return this->has_bias_;
  }

  inline ConvolutionOperatorTester& weights_type(WeightsType weights_type) {
    this->weights_type_ = weights_type;
    return *this;
  }

  inline WeightsType weights_type() const {
    return this->weights_type_;
  }

  inline ConvolutionOperatorTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

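  // Tests the per-channel quantized (QC8) NHWC convolution path: computes int32 reference accumulators,
  // derives per-channel requantization scales from the observed accumulator range, then creates, sets up,
  // and runs the operator and compares its output against the renormalized reference.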
  void TestNHWCxQC8() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
    auto i8rng = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
      std::ref(rng));
    auto w8rng = std::bind(
      std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()),
      std::ref(rng));

    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) +
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
    std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<int32_t> bias(groups() * group_output_channels());
    std::vector<int8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
    std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<float> requantization_scales(groups() * group_output_channels());

    const int8_t input_zero_point = -1;
    const int8_t output_zero_point = -1;

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(i8rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(w8rng));
      std::generate(bias.begin(), bias.end(), std::ref(i32rng));
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results, without renormalization.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(accumulators.begin(), accumulators.end(), 0);
      }
      if (depthwise_layout()) {
        ASSERT_EQ(group_input_channels(), 1);

        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g]) - int32_t(input_zero_point)) *
                            int32_t(kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]);
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      } else {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          for (size_t ic = 0; ic < group_input_channels(); ic++) {
                            accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                              (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
                              int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute renormalization parameters.
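      // For each output channel, pick a scale that keeps that channel's extreme accumulators inside
      // the int8 output range, capped just below 1.0 (0x1.FFFFFEp-1f).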
      for (size_t c = 0; c < groups() * group_output_channels(); c++) {
        int32_t accumulated_min = accumulators[c];
        int32_t accumulated_max = accumulators[c];
        for (size_t px = 0; px < batch_size() * output_height() * output_width(); px++) {
          accumulated_min = std::min(accumulated_min, accumulators[px * groups() * group_output_channels() + c]);
          accumulated_max = std::max(accumulated_max, accumulators[px * groups() * group_output_channels() + c]);
        }

        float requantization_scale = 0x1.0p-32f;
        if (accumulated_max != 0) {
          requantization_scale = std::max(requantization_scale,
            float(int32_t(std::numeric_limits<int8_t>::max()) - int32_t(output_zero_point)) / float(accumulated_max));
        }
        if (accumulated_min != 0) {
          requantization_scale = std::max(requantization_scale,
            float(int32_t(std::numeric_limits<int8_t>::min()) - int32_t(output_zero_point)) / float(accumulated_min));
        }
        requantization_scale = std::min(requantization_scale, 0x1.FFFFFEp-1f);

        requantization_scales[c] = requantization_scale;
      }

      // Renormalize reference results.
      for (size_t c = 0; c < groups() * group_output_channels(); c++) {
        for (size_t px = 0; px < batch_size() * output_height() * output_width(); px++) {
          output_ref[px * groups() * group_output_channels() + c] = double(int32_t(output_zero_point)) +
            double(accumulators[px * groups() * group_output_channels() + c]) * double(requantization_scales[c]);
        }
      }
      std::transform(output_ref.cbegin(), output_ref.cend(), output_ref.begin(),
        [this](double x) -> double {
          return std::max<double>(std::min<double>(x, double(qmax() - 0x80)), double(qmin() - 0x80));
        });

      // Create, setup, run, and destroy Convolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;

      xnn_status status = xnn_create_convolution2d_nhwc_qc8(
        padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
        padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
        kernel_height(), kernel_width(),
        subsampling_height(), subsampling_width(),
        dilation_height(), dilation_width(),
        groups(), group_input_channels(), group_output_channels(),
        input_channel_stride(), output_channel_stride(),
        input_zero_point, 1.0f /* input scale */, requantization_scales.data(),
        kernel.data(), has_bias() ? bias.data() : nullptr,
        output_zero_point, 1.0f /* output scale */, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
        (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
        &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_qc8(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                  output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                  double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]),
                  0.9)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

  void TestNHWCxQS8() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
    auto i8rng = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
      std::ref(rng));
    auto w8rng = std::bind(
      std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()),
      std::ref(rng));

    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) +
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
    std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<int32_t> bias(groups() * group_output_channels());
    std::vector<int8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
    std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    const int8_t input_zero_point = -1;

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(i8rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(w8rng));
      std::generate(bias.begin(), bias.end(), std::ref(i32rng));
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results, without renormalization.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(accumulators.begin(), accumulators.end(), 0);
      }
      if (depthwise_layout()) {
        ASSERT_EQ(group_input_channels(), 1);

        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g]) - int32_t(input_zero_point)) *
                            int32_t(kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]);
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      } else {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          for (size_t ic = 0; ic < group_input_channels(); ic++) {
                            accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                              (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
                              int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());

      const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
      const int8_t output_zero_point = int8_t(std::max(std::min(
        lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));

      // Renormalize reference results.
      std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
        [this, output_scale, output_zero_point](int32_t x) -> double {
          return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
        });

      // Create, setup, run, and destroy Convolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;

      xnn_status status = xnn_create_convolution2d_nhwc_qs8(
        padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
        padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
        kernel_height(), kernel_width(),
        subsampling_height(), subsampling_width(),
        dilation_height(), dilation_width(),
        groups(), group_input_channels(), group_output_channels(),
        input_channel_stride(), output_channel_stride(),
        input_zero_point, 1.0f /* input scale */, 1.0f /* kernel scale */,
        kernel.data(), has_bias() ? bias.data() : nullptr,
        output_zero_point, output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
        (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
        &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_qs8(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                  output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                  double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
                  0.9)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

  void TestNHWCxQU8() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
    auto u8rng = std::bind(
      std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));

    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
    std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<int32_t> bias(groups() * group_output_channels());
    std::vector<uint8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
    std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    const uint8_t input_zero_point = 127;
    const uint8_t kernel_zero_point = 127;

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(u8rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
      std::generate(bias.begin(), bias.end(), std::ref(i32rng));
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results, without renormalization.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(accumulators.begin(), accumulators.end(), 0);
      }
      if (depthwise_layout()) {
        ASSERT_EQ(group_input_channels(), 1);

        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g]) - int32_t(input_zero_point)) *
                            (int32_t(kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]) - int32_t(kernel_zero_point));
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      } else {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          for (size_t ic = 0; ic < group_input_channels(); ic++) {
                            accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                              (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
                              (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());

      const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
      const uint8_t output_zero_point = uint8_t(std::max(std::min(
        lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));

      // Renormalize reference results.
      std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
        [this, output_scale, output_zero_point](int32_t x) -> double {
          return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
        });

      // Create, setup, run, and destroy Convolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;

      xnn_status status = xnn_create_convolution2d_nhwc_qu8(
        padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
        padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
        kernel_height(), kernel_width(),
        subsampling_height(), subsampling_width(),
        dilation_height(), dilation_width(),
        groups(), group_input_channels(), group_output_channels(),
        input_channel_stride(), output_channel_stride(),
        input_zero_point, 1.0f /* input scale */,
        kernel_zero_point, 1.0f /* kernel scale */,
        kernel.data(), has_bias() ? bias.data() : nullptr,
        output_zero_point, output_scale, qmin(), qmax(),
        (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
        &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_qu8(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                  output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                  double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
                  0.9)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

  void TestNHWCxF32() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), std::ref(rng));

    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
    std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<float> bias(groups() * group_output_channels());
    std::vector<float> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
      std::generate(bias.begin(), bias.end(), std::ref(f32rng));
      std::fill(output.begin(), output.end(), nanf(""));

      // Compute reference results, without clamping.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
      }
      if (depthwise_layout()) {
        ASSERT_EQ(group_input_channels(), 1);

        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g] *
                            kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc];
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      } else {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          for (size_t ic = 0; ic < group_input_channels(); ic++) {
                            output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                              input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] *
                              kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());

      const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
      const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, run, and destroy Convolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;

      xnn_status status = xnn_create_convolution2d_nhwc_f32(
        padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
        padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
        kernel_height(), kernel_width(),
        subsampling_height(), subsampling_width(),
        dilation_height(), dilation_width(),
        groups(), group_input_channels(), group_output_channels(),
        input_channel_stride(), output_channel_stride(),
        kernel.data(), has_bias() ? bias.data() : nullptr,
        output_min, output_max,
        (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
        &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_f32(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                  output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                  output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c],
                  1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

Frank Barchard49b4dcc2020-06-26 14:07:19 -07001226 void TestNHWCxF16() const {
Marat Dukhan6989ec42022-01-14 17:14:35 -08001227 switch (weights_type()) {
1228 case WeightsType::Default:
1229 break;
1230 case WeightsType::FP32:
1231 break;
1232 default:
1233 GTEST_FAIL() << "unexpected weights type";
1234 }
1235
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001236 std::random_device random_device;
1237 auto rng = std::mt19937(random_device());
Marat Dukhan57c78272021-08-10 22:20:20 -07001238 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001239 auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
1240
1241 std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
1242 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
1243 std::vector<uint16_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
Marat Dukhan6989ec42022-01-14 17:14:35 -08001244 std::vector<float> kernel_as_float(kernel.size());
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001245 std::vector<uint16_t> bias(groups() * group_output_channels());
Marat Dukhan6989ec42022-01-14 17:14:35 -08001246 std::vector<float> bias_as_float(bias.size());
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001247 std::vector<uint16_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
1248 std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1249
1250 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1251 std::generate(input.begin(), input.end(), std::ref(f16rng));
1252 std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
Marat Dukhan6989ec42022-01-14 17:14:35 -08001253 std::transform(kernel.cbegin(), kernel.cend(), kernel_as_float.begin(), fp16_ieee_to_fp32_value);
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001254 std::generate(bias.begin(), bias.end(), std::ref(f16rng));
Marat Dukhan6989ec42022-01-14 17:14:35 -08001255 std::transform(bias.cbegin(), bias.cend(), bias_as_float.begin(), fp16_ieee_to_fp32_value);
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001256 std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
1257
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001259 // Compute reference results, without clamping.
1260 if (has_bias()) {
1261 for (size_t i = 0; i < batch_size(); i++) {
1262 for (size_t oy = 0; oy < output_height(); oy++) {
1263 for (size_t ox = 0; ox < output_width(); ox++) {
1264 for (size_t g = 0; g < groups(); g++) {
1265 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1266 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1267 fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
1268 }
1269 }
1270 }
1271 }
1272 }
1273 } else {
1274 std::fill(output_ref.begin(), output_ref.end(), 0.0f);
1275 }
1276 if (depthwise_layout()) {
1277 ASSERT_EQ(group_input_channels(), 1);
1278
1279 for (size_t i = 0; i < batch_size(); i++) {
1280 for (size_t oy = 0; oy < output_height(); oy++) {
1281 for (size_t ox = 0; ox < output_width(); ox++) {
1282 for (size_t ky = 0; ky < kernel_height(); ky++) {
1283 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1284 if (iy < input_height()) {
1285 for (size_t kx = 0; kx < kernel_width(); kx++) {
1286 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1287 if (ix < input_width()) {
1288 for (size_t g = 0; g < groups(); g++) {
1289 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1290 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1291 fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g]) *
1292 fp16_ieee_to_fp32_value(kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]);
1293 }
1294 }
1295 }
1296 }
1297 }
1298 }
1299 }
1300 }
1301 }
1302 } else {
1303 for (size_t i = 0; i < batch_size(); i++) {
1304 for (size_t oy = 0; oy < output_height(); oy++) {
1305 for (size_t ox = 0; ox < output_width(); ox++) {
1306 for (size_t ky = 0; ky < kernel_height(); ky++) {
1307 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1308 if (iy < input_height()) {
1309 for (size_t kx = 0; kx < kernel_width(); kx++) {
1310 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1311 if (ix < input_width()) {
1312 for (size_t g = 0; g < groups(); g++) {
1313 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1314 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1315 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1316 fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) *
1317 fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1318 }
1319 }
1320 }
1321 }
1322 }
1323 }
1324 }
1325 }
1326 }
1327 }
1328 }
1329
1330 // Compute clamping parameters.
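      // Scheme used below: qmin()/qmax() select clamp bounds that lie qmin/255 above the minimum
      // and (255 - qmax)/255 below the maximum of the reference accumulator range, rounded through
      // fp16 so the bounds are representable by the f16 operator. If rounding collapses both bounds
      // to the same value, clamping is disabled (+/-infinity).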
1331 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1332 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1333 const float accumulated_range = accumulated_max - accumulated_min;
1334 const float scaled_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
1335 const float scaled_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));
1336 const float output_min = scaled_min == scaled_max ? -std::numeric_limits<float>::infinity() : scaled_min;
1337 const float output_max = scaled_min == scaled_max ? +std::numeric_limits<float>::infinity() : scaled_max;
1338
1339 // Clamp reference results.
1340 for (float& value : output_ref) {
1341 value = std::max(std::min(value, output_max), output_min);
1342 }
1343
1344      // Create, set up, run, and destroy the Convolution operator.
1345 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1346 xnn_operator_t convolution_op = nullptr;
1347
Marat Dukhan6989ec42022-01-14 17:14:35 -08001348 const void* kernel_data = kernel.data();
1349 const void* bias_data = bias.data();
1350 if (weights_type() == WeightsType::FP32) {
1351 kernel_data = kernel_as_float.data();
1352 bias_data = bias_as_float.data();
1353 }
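      // When weights_type() is WeightsType::FP32, the fp32 copies of the kernel and bias are passed
      // to the f16 operator together with XNN_FLAG_FP32_STATIC_WEIGHTS (assumption: the flag makes
      // the operator convert the static fp32 weights to fp16 internally at creation time).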
1354 uint32_t flags = 0;
1355 if (depthwise_layout()) {
1356 flags |= XNN_FLAG_DEPTHWISE_CONVOLUTION;
1357 }
1358 if (padding_tf_same()) {
1359 flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING;
1360 }
1361 if (weights_type() == WeightsType::FP32) {
1362 flags |= XNN_FLAG_FP32_STATIC_WEIGHTS;
1363 }
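      // With XNN_FLAG_TENSORFLOW_SAME_PADDING set, the explicit padding arguments below are passed
      // as zero and the operator derives the SAME padding from the kernel, stride, and input size.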
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001364 xnn_status status = xnn_create_convolution2d_nhwc_f16(
1365 padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
1366 padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
1367 kernel_height(), kernel_width(),
1368 subsampling_height(), subsampling_width(),
1369 dilation_height(), dilation_width(),
1370 groups(), group_input_channels(), group_output_channels(),
1371 input_channel_stride(), output_channel_stride(),
Marat Dukhan6989ec42022-01-14 17:14:35 -08001372 kernel_data, has_bias() ? bias_data : nullptr,
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001373 output_min, output_max,
Marat Dukhan6989ec42022-01-14 17:14:35 -08001374 flags,
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001375 &convolution_op);
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001376 if (status == xnn_status_unsupported_hardware) {
1377 GTEST_SKIP();
1378 }
1379 ASSERT_EQ(xnn_status_success, status);
1380 ASSERT_NE(nullptr, convolution_op);
1381
1382 // Smart pointer to automatically delete convolution_op.
1383 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1384
1385 ASSERT_EQ(xnn_status_success,
1386 xnn_setup_convolution2d_nhwc_f16(
1387 convolution_op,
1388 batch_size(), input_height(), input_width(),
1389 input.data(), output.data(),
1390 nullptr /* thread pool */));
1391
1392 ASSERT_EQ(xnn_status_success,
1393 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1394
1395 // Verify results.
1396 for (size_t i = 0; i < batch_size(); i++) {
1397 for (size_t y = 0; y < output_height(); y++) {
1398 for (size_t x = 0; x < output_width(); x++) {
1399 for (size_t g = 0; g < groups(); g++) {
1400 for (size_t c = 0; c < group_output_channels(); c++) {
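                // The min/max range checks below are kept commented out (assumption: fp16 rounding
                // of the clamped outputs can land marginally outside the fp32 clamp bounds), so only
                // the relative-error comparison is used to validate f16 outputs.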
1401// ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min)
1402// << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1403// ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max)
1404// << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
Frank Barchard2b9d29b2020-09-17 12:03:39 -07001405 ASSERT_NEAR(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c], fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), std::max(1.0e-4f, std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]) * 1.0e-2f))
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001406 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1407 }
1408 }
1409 }
1410 }
1411 }
1412 }
1413 }
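  // Usage sketch (hypothetical test, assuming the tester's usual setter methods such as
  // input_size()/kernel_size()/groups()/group_input_channels()/group_output_channels()/iterations()):
  //
  //   TEST(CONVOLUTION_NHWC_F16, grouped_3x3) {
  //     ConvolutionOperatorTester()
  //       .input_size(13, 14)
  //       .kernel_size(3, 3)
  //       .groups(2)
  //       .group_input_channels(15)
  //       .group_output_channels(17)
  //       .iterations(3)
  //       .TestNHWCxF16();
  //   }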
1414
Marat Dukhanefc47b82019-11-18 09:25:38 -08001415 void TestNCHWxF32() const {
Marat Dukhan6989ec42022-01-14 17:14:35 -08001416 ASSERT_EQ(weights_type(), WeightsType::Default);
1417
Marat Dukhanefc47b82019-11-18 09:25:38 -08001418 std::random_device random_device;
1419 auto rng = std::mt19937(random_device());
Marat Dukhan57c78272021-08-10 22:20:20 -07001420 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), std::ref(rng));
Marat Dukhanefc47b82019-11-18 09:25:38 -08001421 auto prng = std::bind(std::uniform_real_distribution<float>(), rng);
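    // prng drives the random kernel sparsification below: each weight is zeroed with probability
    // sparsity(), presumably to exercise sparse-kernel code paths in the NCHW implementation.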
1422
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001423 std::vector<float> input(2 * XNN_EXTRA_BYTES / sizeof(float) +
1424 ((batch_size() - 1) * input_channel_stride() + groups() * group_input_channels()) * input_height() * input_width());
Marat Dukhanefc47b82019-11-18 09:25:38 -08001425 std::vector<float> kernel(
1426 groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1427 std::vector<float> bias(groups() * group_output_channels());
1428 std::vector<float> output(
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001429 ((batch_size() - 1) * output_channel_stride() + groups() * group_output_channels()) * output_height() * output_width());
Marat Dukhanefc47b82019-11-18 09:25:38 -08001430 std::vector<float> output_ref(batch_size() * groups() * group_output_channels() * output_height() * output_width());
1431
1432 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1433 std::generate(input.begin(), input.end(), std::ref(f32rng));
1434 std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
1435 for (float& k : kernel) {
1436 if (prng() <= sparsity()) {
1437 k = 0.0f;
1438 }
1439 }
1440 std::generate(bias.begin(), bias.end(), std::ref(f32rng));
1441 std::fill(output.begin(), output.end(), nanf(""));
1442
1443 // Compute reference results, without clamping.
1444 if (has_bias()) {
1445 for (size_t i = 0; i < batch_size(); i++) {
1446 for (size_t oy = 0; oy < output_height(); oy++) {
1447 for (size_t ox = 0; ox < output_width(); ox++) {
1448 for (size_t g = 0; g < groups(); g++) {
1449 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1450 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] =
1451 bias[g * group_output_channels() + oc];
1452 }
1453 }
1454 }
1455 }
1456 }
1457 } else {
1458 std::fill(output_ref.begin(), output_ref.end(), 0.0f);
1459 }
1460 if (force_nhwc_input()) {
1461 for (size_t i = 0; i < batch_size(); i++) {
1462 for (size_t oy = 0; oy < output_height(); oy++) {
1463 for (size_t ox = 0; ox < output_width(); ox++) {
1464 for (size_t ky = 0; ky < kernel_height(); ky++) {
1465 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1466 if (iy < input_height()) {
1467 for (size_t kx = 0; kx < kernel_width(); kx++) {
1468 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1469 if (ix < input_width()) {
1470 for (size_t g = 0; g < groups(); g++) {
1471 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1472 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1473 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] +=
1474 input[((((i * input_height() + iy) * input_width() + ix) * groups() + g) * group_input_channels() + ic)] *
1475 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
1476 }
1477 }
1478 }
1479 }
1480 }
1481 }
1482 }
1483 }
1484 }
1485 }
Marat Dukhan33032712020-06-18 11:06:04 -07001486 } else if (depthwise_layout()) {
1487 ASSERT_EQ(group_input_channels(), 1);
1488
1489 for (size_t i = 0; i < batch_size(); i++) {
1490 for (size_t oy = 0; oy < output_height(); oy++) {
1491 for (size_t ox = 0; ox < output_width(); ox++) {
1492 for (size_t ky = 0; ky < kernel_height(); ky++) {
1493 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1494 if (iy < input_height()) {
1495 for (size_t kx = 0; kx < kernel_width(); kx++) {
1496 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1497 if (ix < input_width()) {
1498 for (size_t g = 0; g < groups(); g++) {
1499 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1500 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] +=
1501 input[((i * input_channel_stride() + g) * input_height() + iy) * input_width() + ix] *
1502 kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc];
1503 }
1504 }
1505 }
1506 }
1507 }
1508 }
1509 }
1510 }
1511 }
Marat Dukhanefc47b82019-11-18 09:25:38 -08001512 } else {
1513 for (size_t i = 0; i < batch_size(); i++) {
1514 for (size_t oy = 0; oy < output_height(); oy++) {
1515 for (size_t ox = 0; ox < output_width(); ox++) {
1516 for (size_t ky = 0; ky < kernel_height(); ky++) {
1517 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1518 if (iy < input_height()) {
1519 for (size_t kx = 0; kx < kernel_width(); kx++) {
1520 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1521 if (ix < input_width()) {
1522 for (size_t g = 0; g < groups(); g++) {
1523 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1524 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1525 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] +=
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001526 input[((i * input_channel_stride() + g * group_input_channels() + ic) * input_height() + iy) * input_width() + ix] *
Marat Dukhanefc47b82019-11-18 09:25:38 -08001527 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
1528 }
1529 }
1530 }
1531 }
1532 }
1533 }
1534 }
1535 }
1536 }
1537 }
1538 }
1539
1540 // Compute clamping parameters.
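      // qmin() == 0 and qmax() == 255 mean "no clamping" on the respective side; otherwise the
      // bound is placed qmin/255 (resp. (255 - qmax)/255) of the way into the accumulator range.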
1541 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1542 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1543
Marat Dukhan869c62d2020-04-09 17:17:55 -07001544 const float output_min = qmin() == 0 ? -std::numeric_limits<float>::infinity() :
1545 accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
1546 const float output_max = qmax() == 255 ? std::numeric_limits<float>::infinity() :
1547 accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
Marat Dukhanefc47b82019-11-18 09:25:38 -08001548
1549 // Clamp reference results.
1550 for (float& value : output_ref) {
1551 value = std::max(std::min(value, output_max), output_min);
1552 }
1553
1554      // Create, set up, run, and destroy the Convolution operator.
Marat Dukhan04f03be2019-11-19 12:36:47 -08001555 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
Marat Dukhanefc47b82019-11-18 09:25:38 -08001556 xnn_operator_t convolution_op = nullptr;
1557
1558 xnn_status status = xnn_create_convolution2d_nchw_f32(
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001559 padding_top(), padding_right(), padding_bottom(), padding_left(),
1560 kernel_height(), kernel_width(),
1561 subsampling_height(), subsampling_width(),
1562 dilation_height(), dilation_width(),
1563 groups(), group_input_channels(), group_output_channels(),
1564 input_channel_stride(), output_channel_stride(),
1565 kernel.data(), has_bias() ? bias.data() : nullptr,
1566 output_min, output_max,
1567 (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (force_nhwc_input() ? XNN_FLAG_INPUT_NHWC : 0),
1568 &convolution_op);
Marat Dukhanefc47b82019-11-18 09:25:38 -08001569 if (status == xnn_status_unsupported_parameter) {
1570 GTEST_SKIP();
1571 }
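      // The NCHW (channels-first) path implements only a subset of convolution parameters, so
      // unsupported combinations report xnn_status_unsupported_parameter and the test is skipped.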
1572 ASSERT_EQ(xnn_status_success, status);
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001573 ASSERT_NE(nullptr, convolution_op);
Marat Dukhanefc47b82019-11-18 09:25:38 -08001574
1575 // Smart pointer to automatically delete convolution_op.
1576 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1577
1578 ASSERT_EQ(xnn_status_success,
1579 xnn_setup_convolution2d_nchw_f32(
1580 convolution_op,
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001581 batch_size(), input_height(), input_width(),
Marat Dukhanefc47b82019-11-18 09:25:38 -08001582 input.data(), output.data(),
1583 nullptr /* thread pool */));
1584
1585 ASSERT_EQ(xnn_status_success,
1586 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1587
1588 // Verify results.
1589 for (size_t i = 0; i < batch_size(); i++) {
1590 for (size_t y = 0; y < output_height(); y++) {
1591 for (size_t x = 0; x < output_width(); x++) {
1592 for (size_t g = 0; g < groups(); g++) {
1593 for (size_t c = 0; c < group_output_channels(); c++) {
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001594 ASSERT_GE(output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x], output_min)
Marat Dukhanefc47b82019-11-18 09:25:38 -08001595 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i;
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001596 ASSERT_LE(output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x], output_max)
Marat Dukhanefc47b82019-11-18 09:25:38 -08001597 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i;
1598 ASSERT_NEAR(
1599 output_ref[(((i * groups() + g) * group_output_channels() + c) * output_height() + y) * output_width() + x],
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001600 output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x],
Marat Dukhanefc47b82019-11-18 09:25:38 -08001601 1.0e-4 * std::abs(output_ref[(((i * groups() + g) * group_output_channels() + c) * output_height() + y) * output_width() + x]))
1602 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i;
1603 }
1604 }
1605 }
1606 }
1607 }
1608 }
1609 }
1610
Marat Dukhan97262462021-06-18 16:14:17 -07001611 void TestSetupNHWCxQC8() const {
Marat Dukhan6989ec42022-01-14 17:14:35 -08001612 ASSERT_EQ(weights_type(), WeightsType::Default);
1613
Marat Dukhan97262462021-06-18 16:14:17 -07001614 ASSERT_FALSE(depthwise_layout());
1615
1616 std::random_device random_device;
1617 auto rng = std::mt19937(random_device());
Marat Dukhan57c78272021-08-10 22:20:20 -07001618 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
Marat Dukhan97262462021-06-18 16:14:17 -07001619 auto i8rng = std::bind(
Marat Dukhan57c78272021-08-10 22:20:20 -07001620 std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
1621 std::ref(rng));
1622 auto w8rng = std::bind(
1623 std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()),
1624 std::ref(rng));
Marat Dukhan97262462021-06-18 16:14:17 -07001625
1626 std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + std::max(
1627 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
1628 next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
1629 std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1630 std::vector<int32_t> bias(groups() * group_output_channels());
1631 std::vector<int8_t> output(std::max(
1632 batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
1633 next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
1634 std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1635 std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1636 std::vector<float> requantization_scales(groups() * group_output_channels());
1637 std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1638 std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1639 std::vector<float> next_requantization_scales(groups() * group_output_channels());
1640
1641 const int8_t input_zero_point = -1;
1642 const int8_t output_zero_point = -1;
1643
1644 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1645 std::generate(input.begin(), input.end(), std::ref(i8rng));
Marat Dukhan57c78272021-08-10 22:20:20 -07001646 std::generate(kernel.begin(), kernel.end(), std::ref(w8rng));
Marat Dukhan97262462021-06-18 16:14:17 -07001647 std::generate(bias.begin(), bias.end(), std::ref(i32rng));
1648 std::fill(output.begin(), output.end(), 0xA5);
1649
1650 // Compute reference results, without renormalization.
1651 if (has_bias()) {
1652 for (size_t i = 0; i < batch_size(); i++) {
1653 for (size_t oy = 0; oy < output_height(); oy++) {
1654 for (size_t ox = 0; ox < output_width(); ox++) {
1655 for (size_t g = 0; g < groups(); g++) {
1656 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1657 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1658 bias[g * group_output_channels() + oc];
1659 }
1660 }
1661 }
1662 }
1663 }
1664 } else {
1665 std::fill(accumulators.begin(), accumulators.end(), 0);
1666 }
1667 for (size_t i = 0; i < batch_size(); i++) {
1668 for (size_t oy = 0; oy < output_height(); oy++) {
1669 for (size_t ox = 0; ox < output_width(); ox++) {
1670 for (size_t ky = 0; ky < kernel_height(); ky++) {
1671 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1672 if (iy < input_height()) {
1673 for (size_t kx = 0; kx < kernel_width(); kx++) {
1674 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1675 if (ix < input_width()) {
1676 for (size_t g = 0; g < groups(); g++) {
1677 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1678 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1679 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1680 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1681 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1682 }
1683 }
1684 }
1685 }
1686 }
1687 }
1688 }
1689 }
1690 }
1691 }
1692
1693 // Compute renormalization parameters.
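      // Per-channel requantization: the scale is chosen from the channel's extreme accumulators so
      // that the quantized outputs span most of the int8 range around the output zero point, and is
      // clamped to [2^-32, 0x1.FFFFFEp-1] to keep it positive and strictly below 1.0.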
1694 for (size_t c = 0; c < groups() * group_output_channels(); c++) {
1695 int32_t accumulated_min = accumulators[c];
1696 int32_t accumulated_max = accumulators[c];
1697 for (size_t px = 0; px < batch_size() * output_height() * output_width(); px++) {
1698 accumulated_min = std::min(accumulated_min, accumulators[px * groups() * group_output_channels() + c]);
1699 accumulated_max = std::max(accumulated_max, accumulators[px * groups() * group_output_channels() + c]);
1700 }
1701
1702 float requantization_scale = 0x1.0p-32f;
1703 if (accumulated_max != 0) {
1704 requantization_scale = std::max(requantization_scale,
1705 float(int32_t(std::numeric_limits<int8_t>::max()) - int32_t(output_zero_point)) / float(accumulated_max));
1706 }
1707 if (accumulated_min != 0) {
1708 requantization_scale = std::max(requantization_scale,
1709 float(int32_t(std::numeric_limits<int8_t>::min()) - int32_t(output_zero_point)) / float(accumulated_min));
1710 }
1711 requantization_scale = std::min(requantization_scale, 0x1.FFFFFEp-1f);
1712
1713 requantization_scales[c] = requantization_scale;
1714 }
1715
1716 // Renormalize reference results.
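      // The reference output is output_zero_point + accumulator * per-channel scale, computed in
      // double precision and then clamped to the [qmin() - 128, qmax() - 128] signed window below.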
1717 for (size_t c = 0; c < groups() * group_output_channels(); c++) {
1718 for (size_t px = 0; px < batch_size() * output_height() * output_width(); px++) {
1719 output_ref[px * groups() * group_output_channels() + c] = double(int32_t(output_zero_point)) +
1720 double(accumulators[px * groups() * group_output_channels() + c]) * double(requantization_scales[c]);
1721 }
1722 }
1723 std::transform(output_ref.cbegin(), output_ref.cend(), output_ref.begin(),
1724 [this](double x) -> double {
1725 return std::max<double>(std::min<double>(x, double(qmax() - 0x80)), double(qmin() - 0x80));
1726 });
1727
1728      // Create, set up, and run the Convolution operator once.
1729 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1730 xnn_operator_t convolution_op = nullptr;
1731
1732 xnn_status status = xnn_create_convolution2d_nhwc_qc8(
1733 padding_top(), padding_right(), padding_bottom(), padding_left(),
1734 kernel_height(), kernel_width(),
1735 subsampling_height(), subsampling_width(),
1736 dilation_height(), dilation_width(),
1737 groups(), group_input_channels(), group_output_channels(),
1738 input_channel_stride(), output_channel_stride(),
1739 input_zero_point, 1.0f /* input scale */, requantization_scales.data(),
1740 kernel.data(), has_bias() ? bias.data() : nullptr,
1741 output_zero_point, 1.0f /* output scale */, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
1742 0, &convolution_op);
1743 if (status == xnn_status_unsupported_hardware) {
1744 GTEST_SKIP();
1745 }
1746 ASSERT_EQ(xnn_status_success, status);
1747 ASSERT_NE(nullptr, convolution_op);
1748
1749 // Smart pointer to automatically delete convolution_op.
1750 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1751
1752 ASSERT_EQ(xnn_status_success,
1753 xnn_setup_convolution2d_nhwc_qc8(
1754 convolution_op,
1755 batch_size(), input_height(), input_width(),
1756 input.data(), output.data(),
1757 nullptr /* thread pool */));
1758
1759 ASSERT_EQ(xnn_status_success,
1760 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1761
1762 // Verify results of the first run.
1763 for (size_t i = 0; i < batch_size(); i++) {
1764 for (size_t y = 0; y < output_height(); y++) {
1765 for (size_t x = 0; x < output_width(); x++) {
1766 for (size_t g = 0; g < groups(); g++) {
1767 for (size_t c = 0; c < group_output_channels(); c++) {
1768 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
1769 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1770 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
1771 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1772 ASSERT_NEAR(
1773 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1774 double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]),
1775 0.9)
1776 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1777 }
1778 }
1779 }
1780 }
1781 }
1782
1783 // Re-generate data for the second run.
1784 std::generate(input.begin(), input.end(), std::ref(i8rng));
1785 std::fill(output.begin(), output.end(), 0xA5);
1786
1787 // Compute reference results for the second run, including renormalization.
1788 if (has_bias()) {
1789 for (size_t i = 0; i < next_batch_size(); i++) {
1790 for (size_t oy = 0; oy < next_output_height(); oy++) {
1791 for (size_t ox = 0; ox < next_output_width(); ox++) {
1792 for (size_t g = 0; g < groups(); g++) {
1793 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1794 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1795 bias[g * group_output_channels() + oc];
1796 }
1797 }
1798 }
1799 }
1800 }
1801 } else {
1802 std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
1803 }
1804 for (size_t i = 0; i < next_batch_size(); i++) {
1805 for (size_t oy = 0; oy < next_output_height(); oy++) {
1806 for (size_t ox = 0; ox < next_output_width(); ox++) {
1807 for (size_t ky = 0; ky < kernel_height(); ky++) {
1808 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1809 if (iy < next_input_height()) {
1810 for (size_t kx = 0; kx < kernel_width(); kx++) {
1811 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1812 if (ix < next_input_width()) {
1813 for (size_t g = 0; g < groups(); g++) {
1814 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1815 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1816 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1817 (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1818 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1819 }
1820 }
1821 }
1822 }
1823 }
1824 }
1825 }
1826 }
1827 }
1828 }
1829 for (size_t c = 0; c < groups() * group_output_channels(); c++) {
1830 for (size_t px = 0; px < next_batch_size() * next_output_height() * next_output_width(); px++) {
1831 next_output_ref[px * groups() * group_output_channels() + c] = double(int32_t(output_zero_point)) +
1832 double(next_accumulators[px * groups() * group_output_channels() + c]) * double(requantization_scales[c]);
1833 }
1834 }
1835 std::transform(next_output_ref.cbegin(), next_output_ref.cend(), next_output_ref.begin(),
1836 [this](double x) -> double {
1837 return std::max<double>(std::min<double>(x, double(qmax() - 0x80)), double(qmin() - 0x80));
1838 });
1839
1840      // Set up and run the Convolution operator a second time, then destroy the operator.
1841 ASSERT_EQ(xnn_status_success,
1842 xnn_setup_convolution2d_nhwc_qc8(
1843 convolution_op,
1844 next_batch_size(), next_input_height(), next_input_width(),
1845 input.data(), output.data(),
1846 nullptr /* thread pool */));
1847
1848 ASSERT_EQ(xnn_status_success,
1849 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1850
1851 // Verify results of the second run.
1852 for (size_t i = 0; i < next_batch_size(); i++) {
1853 for (size_t y = 0; y < next_output_height(); y++) {
1854 for (size_t x = 0; x < next_output_width(); x++) {
1855 for (size_t g = 0; g < groups(); g++) {
1856 for (size_t c = 0; c < group_output_channels(); c++) {
1857 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
1858 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1859 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
1860 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1861 ASSERT_NEAR(
1862 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
1863 double(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]),
1864 0.9)
1865 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1866 }
1867 }
1868 }
1869 }
1870 }
1871 }
1872 }
1873
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001874 void TestSetupNHWCxQS8() const {
Marat Dukhan6989ec42022-01-14 17:14:35 -08001875 ASSERT_EQ(weights_type(), WeightsType::Default);
1876
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001877 ASSERT_FALSE(depthwise_layout());
1878
1879 std::random_device random_device;
1880 auto rng = std::mt19937(random_device());
Marat Dukhan57c78272021-08-10 22:20:20 -07001881 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001882 auto i8rng = std::bind(
Marat Dukhan57c78272021-08-10 22:20:20 -07001883 std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()),
1884 std::ref(rng));
1885 auto w8rng = std::bind(
1886 std::uniform_int_distribution<int32_t>(-std::numeric_limits<int8_t>::max(), std::numeric_limits<int8_t>::max()),
1887 std::ref(rng));
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001888
1889 std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + std::max(
1890 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
Marat Dukhan97262462021-06-18 16:14:17 -07001891 next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001892 std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1893 std::vector<int32_t> bias(groups() * group_output_channels());
1894 std::vector<int8_t> output(std::max(
1895 batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
1896 next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
1897 std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1898 std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1899 std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1900 std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1901
1902 const int8_t input_zero_point = -1;
1903
1904 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1905 std::generate(input.begin(), input.end(), std::ref(i8rng));
Marat Dukhan57c78272021-08-10 22:20:20 -07001906 std::generate(kernel.begin(), kernel.end(), std::ref(w8rng));
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001907 std::generate(bias.begin(), bias.end(), std::ref(i32rng));
1908 std::fill(output.begin(), output.end(), 0xA5);
1909
1910 // Compute reference results, without renormalization.
1911 if (has_bias()) {
1912 for (size_t i = 0; i < batch_size(); i++) {
1913 for (size_t oy = 0; oy < output_height(); oy++) {
1914 for (size_t ox = 0; ox < output_width(); ox++) {
1915 for (size_t g = 0; g < groups(); g++) {
1916 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1917 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1918 bias[g * group_output_channels() + oc];
1919 }
1920 }
1921 }
1922 }
1923 }
1924 } else {
1925 std::fill(accumulators.begin(), accumulators.end(), 0);
1926 }
1927 for (size_t i = 0; i < batch_size(); i++) {
1928 for (size_t oy = 0; oy < output_height(); oy++) {
1929 for (size_t ox = 0; ox < output_width(); ox++) {
1930 for (size_t ky = 0; ky < kernel_height(); ky++) {
1931 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1932 if (iy < input_height()) {
1933 for (size_t kx = 0; kx < kernel_width(); kx++) {
1934 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1935 if (ix < input_width()) {
1936 for (size_t g = 0; g < groups(); g++) {
1937 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1938 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1939 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1940 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1941 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1942 }
1943 }
1944 }
1945 }
1946 }
1947 }
1948 }
1949 }
1950 }
1951 }
1952
1953 // Compute renormalization parameters.
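      // The output scale spreads the full accumulator range over 255 steps, and the output zero
      // point is chosen to center that range within int8 (clamped to [-128, 127]).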
1954 const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
1955 const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
1956
1957 const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
1958 const int8_t output_zero_point = int8_t(std::max(std::min(
1959 lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
1960 long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));
1961
1962 // Renormalize reference results.
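      // The reference is kept relative to the output zero point: accumulator / output_scale,
      // clamped to the [qmin() - 128, qmax() - 128] window shifted by -output_zero_point.
      // The checks below therefore subtract output_zero_point from the actual outputs.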
1963 std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
1964 [this, output_scale, output_zero_point](int32_t x) -> double {
1965 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
1966 });
1967
1968      // Create, set up, and run the Convolution operator once.
1969 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1970 xnn_operator_t convolution_op = nullptr;
1971
1972 xnn_status status = xnn_create_convolution2d_nhwc_qs8(
1973 padding_top(), padding_right(), padding_bottom(), padding_left(),
1974 kernel_height(), kernel_width(),
1975 subsampling_height(), subsampling_width(),
1976 dilation_height(), dilation_width(),
1977 groups(), group_input_channels(), group_output_channels(),
1978 input_channel_stride(), output_channel_stride(),
1979 input_zero_point, 1.0f /* input scale */, 1.0f /* kernel scale */,
1980 kernel.data(), has_bias() ? bias.data() : nullptr,
1981 output_zero_point, output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
1982 0, &convolution_op);
1983 if (status == xnn_status_unsupported_hardware) {
1984 GTEST_SKIP();
1985 }
1986 ASSERT_EQ(xnn_status_success, status);
1987 ASSERT_NE(nullptr, convolution_op);
1988
1989 // Smart pointer to automatically delete convolution_op.
1990 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1991
1992 ASSERT_EQ(xnn_status_success,
1993 xnn_setup_convolution2d_nhwc_qs8(
1994 convolution_op,
1995 batch_size(), input_height(), input_width(),
1996 input.data(), output.data(),
1997 nullptr /* thread pool */));
1998
1999 ASSERT_EQ(xnn_status_success,
2000 xnn_run_operator(convolution_op, nullptr /* thread pool */));
2001
2002 // Verify results of the first run.
2003 for (size_t i = 0; i < batch_size(); i++) {
2004 for (size_t y = 0; y < output_height(); y++) {
2005 for (size_t x = 0; x < output_width(); x++) {
2006 for (size_t g = 0; g < groups(); g++) {
2007 for (size_t c = 0; c < group_output_channels(); c++) {
2008 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
2009 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2010 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
2011 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2012 ASSERT_NEAR(
2013 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
2014 double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
2015 0.9)
2016 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2017 }
2018 }
2019 }
2020 }
2021 }
2022
2023 // Re-generate data for the second run.
2024 std::generate(input.begin(), input.end(), std::ref(i8rng));
2025 std::fill(output.begin(), output.end(), 0xA5);
2026
2027 // Compute reference results for the second run, including renormalization.
2028 if (has_bias()) {
2029 for (size_t i = 0; i < next_batch_size(); i++) {
2030 for (size_t oy = 0; oy < next_output_height(); oy++) {
2031 for (size_t ox = 0; ox < next_output_width(); ox++) {
2032 for (size_t g = 0; g < groups(); g++) {
2033 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2034 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2035 bias[g * group_output_channels() + oc];
2036 }
2037 }
2038 }
2039 }
2040 }
2041 } else {
2042 std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
2043 }
2044 for (size_t i = 0; i < next_batch_size(); i++) {
2045 for (size_t oy = 0; oy < next_output_height(); oy++) {
2046 for (size_t ox = 0; ox < next_output_width(); ox++) {
2047 for (size_t ky = 0; ky < kernel_height(); ky++) {
2048 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2049 if (iy < next_input_height()) {
2050 for (size_t kx = 0; kx < kernel_width(); kx++) {
2051 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2052 if (ix < next_input_width()) {
2053 for (size_t g = 0; g < groups(); g++) {
2054 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2055 for (size_t ic = 0; ic < group_input_channels(); ic++) {
2056 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2057 (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
2058 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
2059 }
2060 }
2061 }
2062 }
2063 }
2064 }
2065 }
2066 }
2067 }
2068 }
2069 std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(),
2070 [this, output_scale, output_zero_point](int32_t x) -> double {
2071 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
2072 });
2073
2074      // Set up and run the Convolution operator a second time, then destroy the operator.
2075 ASSERT_EQ(xnn_status_success,
2076 xnn_setup_convolution2d_nhwc_qs8(
2077 convolution_op,
2078 next_batch_size(), next_input_height(), next_input_width(),
2079 input.data(), output.data(),
2080 nullptr /* thread pool */));
2081
2082 ASSERT_EQ(xnn_status_success,
2083 xnn_run_operator(convolution_op, nullptr /* thread pool */));
2084
2085 // Verify results of the second run.
2086 for (size_t i = 0; i < next_batch_size(); i++) {
2087 for (size_t y = 0; y < next_output_height(); y++) {
2088 for (size_t x = 0; x < next_output_width(); x++) {
2089 for (size_t g = 0; g < groups(); g++) {
2090 for (size_t c = 0; c < group_output_channels(); c++) {
2091 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
2092 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2093 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
2094 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2095 ASSERT_NEAR(
2096 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
2097 double(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
2098 0.9)
2099 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2100 }
2101 }
2102 }
2103 }
2104 }
2105 }
2106 }
2107
Marat Dukhan08b7a972020-07-14 18:17:29 -07002108 void TestSetupNHWCxQU8() const {
Marat Dukhan6989ec42022-01-14 17:14:35 -08002109 ASSERT_EQ(weights_type(), WeightsType::Default);
2110
XNNPACK Teamb455b122019-09-27 18:10:33 -07002111 ASSERT_FALSE(depthwise_layout());
2112
2113 std::random_device random_device;
2114 auto rng = std::mt19937(random_device());
Marat Dukhan57c78272021-08-10 22:20:20 -07002115 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), std::ref(rng));
2116 auto u8rng = std::bind(
2117 std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
XNNPACK Teamb455b122019-09-27 18:10:33 -07002118
2119 std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + std::max(
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002120 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
Marat Dukhan97262462021-06-18 16:14:17 -07002121 next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
XNNPACK Teamb455b122019-09-27 18:10:33 -07002122 std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
2123 std::vector<int32_t> bias(groups() * group_output_channels());
2124 std::vector<uint8_t> output(std::max(
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002125 batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
2126 next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
XNNPACK Teamb455b122019-09-27 18:10:33 -07002127 std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
2128 std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
2129 std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
2130 std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
2131
2132 const uint8_t input_zero_point = 127;
2133 const uint8_t kernel_zero_point = 127;
2134
2135 for (size_t iteration = 0; iteration < iterations(); iteration++) {
2136 std::generate(input.begin(), input.end(), std::ref(u8rng));
2137 std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
Marat Dukhanecd83112020-08-03 21:50:28 -07002138 std::generate(bias.begin(), bias.end(), std::ref(i32rng));
XNNPACK Teamb455b122019-09-27 18:10:33 -07002139 std::fill(output.begin(), output.end(), 0xA5);
2140
2141 // Compute reference results, without renormalization.
Marat Dukhanf568f082019-10-30 09:47:07 -07002142 if (has_bias()) {
2143 for (size_t i = 0; i < batch_size(); i++) {
2144 for (size_t oy = 0; oy < output_height(); oy++) {
2145 for (size_t ox = 0; ox < output_width(); ox++) {
2146 for (size_t g = 0; g < groups(); g++) {
2147 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2148 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2149 bias[g * group_output_channels() + oc];
2150 }
XNNPACK Teamb455b122019-09-27 18:10:33 -07002151 }
2152 }
2153 }
2154 }
Marat Dukhanf568f082019-10-30 09:47:07 -07002155 } else {
2156 std::fill(accumulators.begin(), accumulators.end(), 0);
XNNPACK Teamb455b122019-09-27 18:10:33 -07002157 }
2158 for (size_t i = 0; i < batch_size(); i++) {
2159 for (size_t oy = 0; oy < output_height(); oy++) {
2160 for (size_t ox = 0; ox < output_width(); ox++) {
2161 for (size_t ky = 0; ky < kernel_height(); ky++) {
2162 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2163 if (iy < input_height()) {
2164 for (size_t kx = 0; kx < kernel_width(); kx++) {
2165 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2166 if (ix < input_width()) {
2167 for (size_t g = 0; g < groups(); g++) {
2168 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2169 for (size_t ic = 0; ic < group_input_channels(); ic++) {
2170 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002171 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
XNNPACK Teamb455b122019-09-27 18:10:33 -07002172 (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
2173 }
2174 }
2175 }
2176 }
2177 }
2178 }
2179 }
2180 }
2181 }
2182 }
2183
2184 // Compute renormalization parameters.
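      // Same scheme as the signed path, except the zero point is centered around 127.5 so that the
      // accumulator range maps into the unsigned [0, 255] output range.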
2185 const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
2186 const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
2187
2188 const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
2189 const uint8_t output_zero_point = uint8_t(std::max(std::min(
2190 lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
2191 long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));
2192
2193 // Renormalize reference results.
2194 std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
2195 [this, output_scale, output_zero_point](int32_t x) -> double {
2196 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
2197 });
2198
2199      // Create, set up, and run the Convolution operator once.
Marat Dukhan04f03be2019-11-19 12:36:47 -08002200 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
XNNPACK Teamb455b122019-09-27 18:10:33 -07002201 xnn_operator_t convolution_op = nullptr;
2202
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002203 xnn_status status = xnn_create_convolution2d_nhwc_qu8(
XNNPACK Teamb455b122019-09-27 18:10:33 -07002204 padding_top(), padding_right(), padding_bottom(), padding_left(),
2205 kernel_height(), kernel_width(),
2206 subsampling_height(), subsampling_width(),
2207 dilation_height(), dilation_width(),
2208 groups(), group_input_channels(), group_output_channels(),
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002209 input_channel_stride(), output_channel_stride(),
XNNPACK Teamb455b122019-09-27 18:10:33 -07002210 input_zero_point, 1.0f /* input scale */,
2211 kernel_zero_point, 1.0f /* kernel scale */,
Marat Dukhanf568f082019-10-30 09:47:07 -07002212 kernel.data(), has_bias() ? bias.data() : nullptr,
XNNPACK Teamb455b122019-09-27 18:10:33 -07002213 output_zero_point, output_scale, qmin(), qmax(),
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002214 0, &convolution_op);
2215 if (status == xnn_status_unsupported_hardware) {
2216 GTEST_SKIP();
2217 }
2218 ASSERT_EQ(xnn_status_success, status);
2219 ASSERT_NE(nullptr, convolution_op);
XNNPACK Teamb455b122019-09-27 18:10:33 -07002220
2221 // Smart pointer to automatically delete convolution_op.
2222 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
2223
2224 ASSERT_EQ(xnn_status_success,
Marat Dukhan08b7a972020-07-14 18:17:29 -07002225 xnn_setup_convolution2d_nhwc_qu8(
XNNPACK Teamb455b122019-09-27 18:10:33 -07002226 convolution_op,
2227 batch_size(), input_height(), input_width(),
2228 input.data(), output.data(),
2229 nullptr /* thread pool */));
2230
2231 ASSERT_EQ(xnn_status_success,
2232 xnn_run_operator(convolution_op, nullptr /* thread pool */));
2233
2234 // Verify results of the first run.
2235 for (size_t i = 0; i < batch_size(); i++) {
2236 for (size_t y = 0; y < output_height(); y++) {
2237 for (size_t x = 0; x < output_width(); x++) {
2238 for (size_t g = 0; g < groups(); g++) {
2239 for (size_t c = 0; c < group_output_channels(); c++) {
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002240 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                    output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                    double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
                    0.9)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), std::ref(u8rng));
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results for the second run, including renormalization.
      if (has_bias()) {
        for (size_t i = 0; i < next_batch_size(); i++) {
          for (size_t oy = 0; oy < next_output_height(); oy++) {
            for (size_t ox = 0; ox < next_output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
      }
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
              if (iy < next_input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                  if (ix < next_input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
                            (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
      std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(),
        [this, output_scale, output_zero_point](int32_t x) -> double {
          return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
        });
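      // The reference values above live in the zero-point-free domain: accumulator / output_scale,
      // clamped to [qmin() - output_zero_point, qmax() - output_zero_point]. The checks below
      // therefore subtract output_zero_point from the quantized output before comparing, and
      // accept up to 0.9 of one quantization step of requantization error.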

      // Setup and run Convolution operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_qu8(
          convolution_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                    next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
                    double(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
                    0.9)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

  void TestSetupNHWCxF16() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    ASSERT_FALSE(depthwise_layout());

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), std::ref(rng));
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max(
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
      next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
    std::vector<uint16_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<uint16_t> bias(groups() * group_output_channels());
    std::vector<uint16_t> output(std::max(
      batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
      next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f16rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
      std::generate(bias.begin(), bias.end(), std::ref(f16rng));
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results, without clamping.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
                }
              }
            }
          }
        }
      } else {
        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
      }
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
              if (iy < input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                  if (ix < input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) *
                            fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
      const float scaled_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
      const float scaled_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));
      const float output_min = scaled_min == scaled_max ? -std::numeric_limits<float>::infinity() : scaled_min;
      const float output_max = scaled_min == scaled_max ? +std::numeric_limits<float>::infinity() : scaled_max;
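      // The clamping thresholds are derived from the un-clamped reference range: qmin() and
      // 255 - qmax() select what fraction (in 255ths) is trimmed from the bottom and top of the
      // range. Both thresholds are rounded through fp16 so they are exactly representable in the
      // output type; if that rounding collapses the range to a single value, clamping is
      // effectively disabled by substituting +/-infinity.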

      for (float& output_value : output_ref) {
        output_value = std::min(std::max(output_value, output_min), output_max);
      }

      // Create, setup, and run Convolution operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;

      xnn_status status = xnn_create_convolution2d_nhwc_f16(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          kernel_height(), kernel_width(),
          subsampling_height(), subsampling_width(),
          dilation_height(), dilation_width(),
          groups(), group_input_channels(), group_output_channels(),
          input_channel_stride(), output_channel_stride(),
          kernel.data(), has_bias() ? bias.data() : nullptr,
          output_min, output_max,
          0, &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_f16(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c], fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), std::max(1.0e-4f, std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]) * 1.0e-2f))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), std::ref(f16rng));
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results for the second run, including clamping.
      if (has_bias()) {
        for (size_t i = 0; i < next_batch_size(); i++) {
          for (size_t oy = 0; oy < next_output_height(); oy++) {
            for (size_t ox = 0; ox < next_output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
                }
              }
            }
          }
        }
      } else {
        std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
      }
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
              if (iy < next_input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                  if (ix < next_input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            fp16_ieee_to_fp32_value(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) *
                            fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
      for (float& value : next_output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Setup and run Convolution operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_f16(
          convolution_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c], fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), std::max(1.0e-4f, std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]) * 1.0e-2f))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

  void TestSetupNHWCxF32() const {
    ASSERT_EQ(weights_type(), WeightsType::Default);

    ASSERT_FALSE(depthwise_layout());

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), std::ref(rng));

    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + std::max(
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
      next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
    std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<float> bias(groups() * group_output_channels());
    std::vector<float> output(std::max(
      batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
      next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
      std::generate(bias.begin(), bias.end(), std::ref(f32rng));
      std::fill(output.begin(), output.end(), nanf(""));

      // Compute reference results, without clamping.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
      }
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
              if (iy < input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                  if (ix < input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] *
                            kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());

      const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
      const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
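      // Example: with the default qmin() == 0 and qmax() == 255 the accumulated range is kept
      // as-is; with qmin() == 64 the lower threshold is raised by 64/255 (about 25%) of the
      // accumulated range while the upper threshold stays at accumulated_max.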

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, and run Convolution operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;

      xnn_status status = xnn_create_convolution2d_nhwc_f32(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          kernel_height(), kernel_width(),
          subsampling_height(), subsampling_width(),
          dilation_height(), dilation_width(),
          groups(), group_input_channels(), group_output_channels(),
          input_channel_stride(), output_channel_stride(),
          kernel.data(), has_bias() ? bias.data() : nullptr,
          output_min, output_max,
          0, &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_f32(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                    output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                    output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c],
                    1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      std::fill(output.begin(), output.end(), nanf(""));

      // Compute reference results for the second run, including clamping.
      if (has_bias()) {
        for (size_t i = 0; i < next_batch_size(); i++) {
          for (size_t oy = 0; oy < next_output_height(); oy++) {
            for (size_t ox = 0; ox < next_output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
      }
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
              if (iy < next_input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                  if (ix < next_input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] *
                            kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
      for (float& value : next_output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Setup and run Convolution operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_f32(
          convolution_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_GE(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_LE(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                    next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
                    output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c],
                    1.0e-4 * std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

 private:
  uint32_t padding_top_{0};
  uint32_t padding_right_{0};
  uint32_t padding_bottom_{0};
  uint32_t padding_left_{0};
  bool padding_tf_same_{false};
  size_t input_height_{1};
  size_t input_width_{1};
  uint32_t groups_{1};
  size_t group_input_channels_{1};
  size_t input_channel_stride_{0};
  size_t group_output_channels_{1};
  size_t output_channel_stride_{0};
  size_t batch_size_{1};
  uint32_t kernel_height_{1};
  uint32_t kernel_width_{1};
  uint32_t dilation_height_{1};
  uint32_t dilation_width_{1};
  uint32_t subsampling_height_{1};
  uint32_t subsampling_width_{1};
  size_t next_input_height_{0};
  size_t next_input_width_{0};
  size_t next_batch_size_{0};
  float sparsity_{0.0f};
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  bool depthwise_layout_{false};
  bool force_nhwc_input_{false};
  bool has_bias_{true};
  WeightsType weights_type_{WeightsType::Default};
  size_t iterations_{1};
};
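// A minimal usage sketch (not part of the tester itself). It assumes the fluent setters follow the
// same pattern as padding() above and mirror the private fields (batch_size(), input_height(),
// kernel_height(), next_batch_size(), ...); the exact setter names used by the real test files may
// differ, so treat the snippet below as illustrative only:
//
//   TEST(CONVOLUTION_NHWC_F32, setup_changing_batch) {
//     ConvolutionOperatorTester()
//       .batch_size(2)
//       .next_batch_size(5)
//       .input_height(8)
//       .input_width(8)
//       .kernel_height(3)
//       .kernel_width(3)
//       .group_input_channels(15)
//       .group_output_channels(17)
//       .TestSetupNHWCxF32();
//   }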