blob: 975b38bb6af0400ae94c0eac8345999ace570d63 [file] [log] [blame]
XNNPACK Teamb455b122019-09-27 18:10:33 -07001// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
9#pragma once
10
11#include <gtest/gtest.h>
12
13#include <algorithm>
14#include <cassert>
15#include <cmath>
16#include <cstddef>
17#include <cstdlib>
18#include <functional>
Marat Dukhan5ce30d92020-04-14 03:31:26 -070019#include <limits>
XNNPACK Teamb455b122019-09-27 18:10:33 -070020#include <random>
21#include <vector>
22
Frank Barchard49b4dcc2020-06-26 14:07:19 -070023#include <fp16.h>
24
XNNPACK Teamb455b122019-09-27 18:10:33 -070025#include <xnnpack.h>
26
27
28class ConvolutionOperatorTester {
29 public:
Marat Dukhan8440fde2019-10-24 12:46:13 -070030 inline ConvolutionOperatorTester& padding_tf_same(bool padding_same) {
31 if (padding_same) {
32 assert(padding_top() == 0);
33 assert(padding_left() == 0);
34 assert(padding_bottom() == 0);
35 assert(padding_right() == 0);
36 }
37 this->padding_tf_same_ = padding_same;
38 return *this;
39 }
40
41 inline bool padding_tf_same() const {
42 return this->padding_tf_same_;
43 }
44
XNNPACK Teamb455b122019-09-27 18:10:33 -070045 inline ConvolutionOperatorTester& padding(uint32_t padding) {
Marat Dukhan8440fde2019-10-24 12:46:13 -070046 assert(!padding_tf_same());
XNNPACK Teamb455b122019-09-27 18:10:33 -070047 this->padding_top_ = padding;
48 this->padding_right_ = padding;
49 this->padding_bottom_ = padding;
50 this->padding_left_ = padding;
51 return *this;
52 }
53
54 inline ConvolutionOperatorTester& padding(uint32_t padding_height, uint32_t padding_width) {
Marat Dukhan8440fde2019-10-24 12:46:13 -070055 assert(!padding_tf_same());
XNNPACK Teamb455b122019-09-27 18:10:33 -070056 this->padding_top_ = padding_height;
57 this->padding_right_ = padding_width;
58 this->padding_bottom_ = padding_height;
59 this->padding_left_ = padding_width;
60 return *this;
61 }
62
63 inline ConvolutionOperatorTester& padding_height(uint32_t padding_height) {
Marat Dukhan8440fde2019-10-24 12:46:13 -070064 assert(!padding_tf_same());
XNNPACK Teamb455b122019-09-27 18:10:33 -070065 this->padding_top_ = padding_height;
66 this->padding_bottom_ = padding_height;
67 return *this;
68 }
69
70 inline ConvolutionOperatorTester& padding_width(uint32_t padding_width) {
Marat Dukhan8440fde2019-10-24 12:46:13 -070071 assert(!padding_tf_same());
XNNPACK Teamb455b122019-09-27 18:10:33 -070072 this->padding_right_ = padding_width;
73 this->padding_left_ = padding_width;
74 return *this;
75 }
76
77 inline ConvolutionOperatorTester& padding_top(uint32_t padding_top) {
Marat Dukhan8440fde2019-10-24 12:46:13 -070078 assert(!padding_tf_same());
XNNPACK Teamb455b122019-09-27 18:10:33 -070079 this->padding_top_ = padding_top;
80 return *this;
81 }
82
83 inline uint32_t padding_top() const {
Marat Dukhan8440fde2019-10-24 12:46:13 -070084 if (padding_tf_same()) {
85 const uint32_t total_padding_height =
86 (output_height() - 1) * subsampling_height() + dilated_kernel_height() - input_height();
87 return total_padding_height / 2;
88 } else {
89 return this->padding_top_;
90 }
XNNPACK Teamb455b122019-09-27 18:10:33 -070091 }
92
93 inline ConvolutionOperatorTester& padding_left(uint32_t padding_left) {
Marat Dukhan8440fde2019-10-24 12:46:13 -070094 assert(!padding_tf_same());
XNNPACK Teamb455b122019-09-27 18:10:33 -070095 this->padding_left_ = padding_left;
96 return *this;
97 }
98
99 inline uint32_t padding_left() const {
Marat Dukhan8440fde2019-10-24 12:46:13 -0700100 if (padding_tf_same()) {
101 const uint32_t total_padding_width =
102 (output_width() - 1) * subsampling_width() + dilated_kernel_width() - input_width();
103 return total_padding_width / 2;
104 } else {
105 return this->padding_left_;
106 }
107 }
108
109 inline ConvolutionOperatorTester& padding_bottom(uint32_t padding_bottom) {
110 assert(!padding_tf_same());
111 this->padding_bottom_ = padding_bottom;
112 return *this;
113 }
114
115 inline uint32_t padding_bottom() const {
116 if (padding_tf_same()) {
117 const uint32_t total_padding_height =
118 (output_height() - 1) * subsampling_height() + dilated_kernel_height() - input_height();
119 return total_padding_height - total_padding_height / 2;
120 } else {
121 return this->padding_bottom_;
122 }
123 }
124
125 inline ConvolutionOperatorTester& padding_right(uint32_t padding_right) {
126 assert(!padding_tf_same());
127 this->padding_right_ = padding_right;
128 return *this;
129 }
130
131 inline uint32_t padding_right() const {
132 if (padding_tf_same()) {
133 const uint32_t total_padding_width =
134 (output_width() - 1) * subsampling_width() + dilated_kernel_width() - input_width();
135 return total_padding_width - total_padding_width / 2;
136 } else {
137 return this->padding_right_;
138 }
XNNPACK Teamb455b122019-09-27 18:10:33 -0700139 }
140
141 inline ConvolutionOperatorTester& input_size(uint32_t input_height, uint32_t input_width) {
142 assert(input_height >= 1);
143 assert(input_width >= 1);
144 this->input_height_ = input_height;
145 this->input_width_ = input_width;
146 return *this;
147 }
148
149 inline ConvolutionOperatorTester& input_height(uint32_t input_height) {
150 assert(input_height >= 1);
151 this->input_height_ = input_height;
152 return *this;
153 }
154
155 inline uint32_t input_height() const {
156 return this->input_height_;
157 }
158
159 inline ConvolutionOperatorTester& input_width(uint32_t input_width) {
160 assert(input_width >= 1);
161 this->input_width_ = input_width;
162 return *this;
163 }
164
165 inline uint32_t input_width() const {
166 return this->input_width_;
167 }
168
169 inline ConvolutionOperatorTester& groups(uint32_t groups) {
170 assert(groups >= 1);
171 this->groups_ = groups;
172 return *this;
173 }
174
175 inline uint32_t groups() const {
176 return this->groups_;
177 }
178
179 inline ConvolutionOperatorTester& group_input_channels(size_t group_input_channels) {
180 assert(group_input_channels >= 1);
181 this->group_input_channels_ = group_input_channels;
182 return *this;
183 }
184
185 inline size_t group_input_channels() const {
186 return this->group_input_channels_;
187 }
188
189 inline ConvolutionOperatorTester& group_output_channels(size_t group_output_channels) {
190 assert(group_output_channels >= 1);
191 this->group_output_channels_ = group_output_channels;
192 return *this;
193 }
194
195 inline size_t group_output_channels() const {
196 return this->group_output_channels_;
197 }
198
199 inline ConvolutionOperatorTester& batch_size(size_t batch_size) {
200 assert(batch_size >= 1);
201 this->batch_size_ = batch_size;
202 return *this;
203 }
204
205 inline size_t batch_size() const {
206 return this->batch_size_;
207 }
208
209 inline ConvolutionOperatorTester& kernel_size(uint32_t kernel_size) {
210 assert(kernel_size >= 1);
211 this->kernel_height_ = kernel_size;
212 this->kernel_width_ = kernel_size;
213 return *this;
214 }
215
216 inline ConvolutionOperatorTester& kernel_size(uint32_t kernel_height, uint32_t kernel_width) {
217 assert(kernel_height >= 1);
218 assert(kernel_width >= 1);
219 this->kernel_height_ = kernel_height;
220 this->kernel_width_ = kernel_width;
221 return *this;
222 }
223
224 inline ConvolutionOperatorTester& kernel_height(uint32_t kernel_height) {
225 assert(kernel_height >= 1);
226 this->kernel_height_ = kernel_height;
227 return *this;
228 }
229
230 inline uint32_t kernel_height() const {
231 return this->kernel_height_;
232 }
233
234 inline ConvolutionOperatorTester& kernel_width(uint32_t kernel_width) {
235 assert(kernel_width >= 1);
236 this->kernel_width_ = kernel_width;
237 return *this;
238 }
239
240 inline uint32_t kernel_width() const {
241 return this->kernel_width_;
242 }
243
244 inline ConvolutionOperatorTester& dilation(uint32_t dilation) {
245 assert(dilation >= 1);
246 this->dilation_height_ = dilation;
247 this->dilation_width_ = dilation;
248 return *this;
249 }
250
251 inline ConvolutionOperatorTester& dilation(uint32_t dilation_height, uint32_t dilation_width) {
252 assert(dilation_height >= 1);
253 assert(dilation_width >= 1);
254 this->dilation_height_ = dilation_height;
255 this->dilation_width_ = dilation_width;
256 return *this;
257 }
258
259 inline ConvolutionOperatorTester& dilation_height(uint32_t dilation_height) {
260 assert(dilation_height >= 1);
261 this->dilation_height_ = dilation_height;
262 return *this;
263 }
264
265 inline uint32_t dilation_height() const {
266 return this->dilation_height_;
267 }
268
269 inline ConvolutionOperatorTester& dilation_width(uint32_t dilation_width) {
270 assert(dilation_width >= 1);
271 this->dilation_width_ = dilation_width;
272 return *this;
273 }
274
275 inline uint32_t dilation_width() const {
276 return this->dilation_width_;
277 }
278
279 inline ConvolutionOperatorTester& subsampling(uint32_t subsampling) {
280 assert(subsampling >= 1);
281 this->subsampling_height_ = subsampling;
282 this->subsampling_width_ = subsampling;
283 return *this;
284 }
285
286 inline ConvolutionOperatorTester& subsampling(uint32_t subsampling_height, uint32_t subsampling_width) {
287 assert(subsampling_height >= 1);
288 assert(subsampling_width >= 1);
289 this->subsampling_height_ = subsampling_height;
290 this->subsampling_width_ = subsampling_width;
291 return *this;
292 }
293
294 inline ConvolutionOperatorTester& subsampling_height(uint32_t subsampling_height) {
295 assert(subsampling_height >= 1);
296 this->subsampling_height_ = subsampling_height;
297 return *this;
298 }
299
300 inline uint32_t subsampling_height() const {
301 return this->subsampling_height_;
302 }
303
304 inline ConvolutionOperatorTester& subsampling_width(uint32_t subsampling_width) {
305 assert(subsampling_width >= 1);
306 this->subsampling_width_ = subsampling_width;
307 return *this;
308 }
309
310 inline uint32_t subsampling_width() const {
311 return this->subsampling_width_;
312 }
313
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700314 inline ConvolutionOperatorTester& input_channel_stride(size_t input_channel_stride) {
315 assert(input_channel_stride >= 1);
316 this->input_channel_stride_ = input_channel_stride;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700317 return *this;
318 }
319
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700320 inline size_t input_channel_stride() const {
321 if (this->input_channel_stride_ == 0) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700322 return group_input_channels() * groups();
323 } else {
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700324 assert(this->input_channel_stride_ >= group_input_channels() * groups());
325 return this->input_channel_stride_;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700326 }
327 }
328
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700329 inline ConvolutionOperatorTester& output_channel_stride(size_t output_channel_stride) {
330 assert(output_channel_stride >= 1);
331 this->output_channel_stride_ = output_channel_stride;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700332 return *this;
333 }
334
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700335 inline size_t output_channel_stride() const {
336 if (this->output_channel_stride_ == 0) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700337 return group_output_channels() * groups();
338 } else {
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700339 assert(this->output_channel_stride_ >= group_output_channels() * groups());
340 return this->output_channel_stride_;
Marat Dukhanefc47b82019-11-18 09:25:38 -0800341 }
342 }
343
XNNPACK Teamb455b122019-09-27 18:10:33 -0700344 inline uint32_t dilated_kernel_height() const {
345 return (kernel_height() - 1) * dilation_height() + 1;
346 }
347
348 inline uint32_t dilated_kernel_width() const {
349 return (kernel_width() - 1) * dilation_width() + 1;
350 }
351
352 inline size_t output_height() const {
Marat Dukhan8440fde2019-10-24 12:46:13 -0700353 if (padding_tf_same()) {
354 return (input_height() + subsampling_height() - 1) / subsampling_height();
XNNPACK Teamb455b122019-09-27 18:10:33 -0700355 } else {
Marat Dukhan8440fde2019-10-24 12:46:13 -0700356 const size_t padded_input_height = padding_top() + input_height() + padding_bottom();
357 if (padded_input_height <= dilated_kernel_height()) {
358 return 1;
359 } else {
360 return (padded_input_height - dilated_kernel_height()) / subsampling_height() + 1;
361 }
XNNPACK Teamb455b122019-09-27 18:10:33 -0700362 }
363 }
364
365 inline size_t output_width() const {
Marat Dukhan8440fde2019-10-24 12:46:13 -0700366 if (padding_tf_same()) {
367 return (input_width() + subsampling_width() - 1) / subsampling_width();
XNNPACK Teamb455b122019-09-27 18:10:33 -0700368 } else {
Marat Dukhan8440fde2019-10-24 12:46:13 -0700369 const size_t padded_input_width = padding_left() + input_width() + padding_right();
370 if (padded_input_width <= dilated_kernel_width()) {
371 return 1;
372 } else {
373 return (padded_input_width - dilated_kernel_width()) / subsampling_width() + 1;
374 }
XNNPACK Teamb455b122019-09-27 18:10:33 -0700375 }
376 }
377
378 inline ConvolutionOperatorTester& next_input_size(uint32_t next_input_height, uint32_t next_input_width) {
379 assert(next_input_height >= 1);
380 assert(next_input_width >= 1);
381 this->next_input_height_ = next_input_height;
382 this->next_input_width_ = next_input_width;
383 return *this;
384 }
385
386 inline ConvolutionOperatorTester& next_input_height(uint32_t next_input_height) {
387 assert(next_input_height >= 1);
388 this->next_input_height_ = next_input_height;
389 return *this;
390 }
391
392 inline uint32_t next_input_height() const {
393 if (this->next_input_height_ == 0) {
394 return input_height();
395 } else {
396 return this->next_input_height_;
397 }
398 }
399
400 inline ConvolutionOperatorTester& next_input_width(uint32_t next_input_width) {
401 assert(next_input_width >= 1);
402 this->next_input_width_ = next_input_width;
403 return *this;
404 }
405
406 inline uint32_t next_input_width() const {
407 if (this->next_input_width_ == 0) {
408 return input_width();
409 } else {
410 return this->next_input_width_;
411 }
412 }
413
414 inline size_t next_output_height() const {
415 const size_t padded_input_height = padding_top() + next_input_height() + padding_bottom();
416 if (padded_input_height <= dilated_kernel_height()) {
417 return 1;
418 } else {
419 return (padded_input_height - dilated_kernel_height()) / subsampling_height() + 1;
420 }
421 }
422
423 inline size_t next_output_width() const {
424 const size_t padded_input_width = padding_left() + next_input_width() + padding_right();
425 if (padded_input_width <= dilated_kernel_width()) {
426 return 1;
427 } else {
428 return (padded_input_width - dilated_kernel_width()) / subsampling_width() + 1;
429 }
430 }
431
432 inline ConvolutionOperatorTester& next_batch_size(size_t next_batch_size) {
433 assert(next_batch_size >= 1);
434 this->next_batch_size_ = next_batch_size;
435 return *this;
436 }
437
438 inline size_t next_batch_size() const {
439 if (this->next_batch_size_ == 0) {
440 return batch_size();
441 } else {
442 return this->next_batch_size_;
443 }
444 }
445
Marat Dukhanefc47b82019-11-18 09:25:38 -0800446 inline ConvolutionOperatorTester& sparsity(float sparsity) {
447 this->sparsity_ = sparsity;
448 return *this;
449 }
450
451 inline float sparsity() const {
452 return this->sparsity_;
453 }
454
XNNPACK Teamb455b122019-09-27 18:10:33 -0700455 inline ConvolutionOperatorTester& qmin(uint8_t qmin) {
456 this->qmin_ = qmin;
457 return *this;
458 }
459
460 inline uint8_t qmin() const {
461 return this->qmin_;
462 }
463
464 inline ConvolutionOperatorTester& qmax(uint8_t qmax) {
465 this->qmax_ = qmax;
466 return *this;
467 }
468
469 inline uint8_t qmax() const {
470 return this->qmax_;
471 }
472
Marat Dukhanefc47b82019-11-18 09:25:38 -0800473 inline ConvolutionOperatorTester& force_nhwc_input(bool force_nhwc_input) {
474 this->force_nhwc_input_ = force_nhwc_input;
475 return *this;
476 }
477
478 inline bool force_nhwc_input() const {
479 return this->force_nhwc_input_;
480 }
481
XNNPACK Teamb455b122019-09-27 18:10:33 -0700482 inline ConvolutionOperatorTester& depthwise_layout(bool depthwise_layout) {
483 this->depthwise_layout_ = depthwise_layout;
484 return *this;
485 }
486
487 inline bool depthwise_layout() const {
488 return this->depthwise_layout_;
489 }
490
Marat Dukhanf568f082019-10-30 09:47:07 -0700491 inline ConvolutionOperatorTester& has_bias(bool has_bias) {
492 this->has_bias_ = has_bias;
493 return *this;
494 }
495
496 inline bool has_bias() const {
497 return this->has_bias_;
498 }
499
XNNPACK Teamb455b122019-09-27 18:10:33 -0700500 inline ConvolutionOperatorTester& iterations(size_t iterations) {
501 this->iterations_ = iterations;
502 return *this;
503 }
504
505 inline size_t iterations() const {
506 return this->iterations_;
507 }
508
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700509 void TestNHWCxQS8() const {
510 std::random_device random_device;
511 auto rng = std::mt19937(random_device());
512 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
513 auto i8rng = std::bind(
514 std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), rng);
515
516 std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) +
517 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()) + 8);
518 std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
519 std::vector<int32_t> bias(groups() * group_output_channels());
520 std::vector<int8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
521 std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
522 std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
523
524 const int8_t input_zero_point = -1;
525
526 for (size_t iteration = 0; iteration < iterations(); iteration++) {
527 std::generate(input.begin(), input.end(), std::ref(i8rng));
528 std::generate(kernel.begin(), kernel.end(), std::ref(i8rng));
529 std::generate(bias.begin(), bias.end(), std::ref(i32rng));
530 std::fill(output.begin(), output.end(), 0xA5);
531
532 // Compute reference results, without renormalization.
533 if (has_bias()) {
534 for (size_t i = 0; i < batch_size(); i++) {
535 for (size_t oy = 0; oy < output_height(); oy++) {
536 for (size_t ox = 0; ox < output_width(); ox++) {
537 for (size_t g = 0; g < groups(); g++) {
538 for (size_t oc = 0; oc < group_output_channels(); oc++) {
539 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
540 bias[g * group_output_channels() + oc];
541 }
542 }
543 }
544 }
545 }
546 } else {
547 std::fill(accumulators.begin(), accumulators.end(), 0);
548 }
549 if (depthwise_layout()) {
550 ASSERT_EQ(group_input_channels(), 1);
551
552 for (size_t i = 0; i < batch_size(); i++) {
553 for (size_t oy = 0; oy < output_height(); oy++) {
554 for (size_t ox = 0; ox < output_width(); ox++) {
555 for (size_t ky = 0; ky < kernel_height(); ky++) {
556 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
557 if (iy < input_height()) {
558 for (size_t kx = 0; kx < kernel_width(); kx++) {
559 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
560 if (ix < input_width()) {
561 for (size_t g = 0; g < groups(); g++) {
562 for (size_t oc = 0; oc < group_output_channels(); oc++) {
563 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
564 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g]) - int32_t(input_zero_point)) *
565 int32_t(kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]);
566 }
567 }
568 }
569 }
570 }
571 }
572 }
573 }
574 }
575 } else {
576 for (size_t i = 0; i < batch_size(); i++) {
577 for (size_t oy = 0; oy < output_height(); oy++) {
578 for (size_t ox = 0; ox < output_width(); ox++) {
579 for (size_t ky = 0; ky < kernel_height(); ky++) {
580 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
581 if (iy < input_height()) {
582 for (size_t kx = 0; kx < kernel_width(); kx++) {
583 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
584 if (ix < input_width()) {
585 for (size_t g = 0; g < groups(); g++) {
586 for (size_t oc = 0; oc < group_output_channels(); oc++) {
587 for (size_t ic = 0; ic < group_input_channels(); ic++) {
588 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
589 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
590 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
591 }
592 }
593 }
594 }
595 }
596 }
597 }
598 }
599 }
600 }
601 }
602
603 // Compute renormalization parameters.
604 const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
605 const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
606
607 const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
608 const int8_t output_zero_point = int8_t(std::max(std::min(
609 lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
610 long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));
611
612 // Renormalize reference results.
613 std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
614 [this, output_scale, output_zero_point](int32_t x) -> double {
615 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
616 });
617
618 // Create, setup, run, and destroy Convolution operator.
619 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
620 xnn_operator_t convolution_op = nullptr;
621
622 xnn_status status = xnn_create_convolution2d_nhwc_qs8(
623 padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
624 padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
625 kernel_height(), kernel_width(),
626 subsampling_height(), subsampling_width(),
627 dilation_height(), dilation_width(),
628 groups(), group_input_channels(), group_output_channels(),
629 input_channel_stride(), output_channel_stride(),
630 input_zero_point, 1.0f /* input scale */, 1.0f /* kernel scale */,
631 kernel.data(), has_bias() ? bias.data() : nullptr,
632 output_zero_point, output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
633 (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
634 &convolution_op);
635 if (status == xnn_status_unsupported_hardware) {
636 GTEST_SKIP();
637 }
638 ASSERT_EQ(xnn_status_success, status);
639 ASSERT_NE(nullptr, convolution_op);
640
641 // Smart pointer to automatically delete convolution_op.
642 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
643
644 ASSERT_EQ(xnn_status_success,
645 xnn_setup_convolution2d_nhwc_qs8(
646 convolution_op,
647 batch_size(), input_height(), input_width(),
648 input.data(), output.data(),
649 nullptr /* thread pool */));
650
651 ASSERT_EQ(xnn_status_success,
652 xnn_run_operator(convolution_op, nullptr /* thread pool */));
653
654 // Verify results.
655 for (size_t i = 0; i < batch_size(); i++) {
656 for (size_t y = 0; y < output_height(); y++) {
657 for (size_t x = 0; x < output_width(); x++) {
658 for (size_t g = 0; g < groups(); g++) {
659 for (size_t c = 0; c < group_output_channels(); c++) {
660 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
661 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
662 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
663 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
664 ASSERT_NEAR(
665 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
666 double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
667 0.9)
668 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
669 }
670 }
671 }
672 }
673 }
674 }
675 }
676
Marat Dukhan08b7a972020-07-14 18:17:29 -0700677 void TestNHWCxQU8() const {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700678 std::random_device random_device;
679 auto rng = std::mt19937(random_device());
Marat Dukhanecd83112020-08-03 21:50:28 -0700680 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
Marat Dukhan5ce30d92020-04-14 03:31:26 -0700681 auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700682
683 std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700684 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()) + 8);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700685 std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
686 std::vector<int32_t> bias(groups() * group_output_channels());
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700687 std::vector<uint8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700688 std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
689 std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
690
691 const uint8_t input_zero_point = 127;
692 const uint8_t kernel_zero_point = 127;
693
694 for (size_t iteration = 0; iteration < iterations(); iteration++) {
695 std::generate(input.begin(), input.end(), std::ref(u8rng));
696 std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
Marat Dukhanecd83112020-08-03 21:50:28 -0700697 std::generate(bias.begin(), bias.end(), std::ref(i32rng));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700698 std::fill(output.begin(), output.end(), 0xA5);
699
700 // Compute reference results, without renormalization.
Marat Dukhanf568f082019-10-30 09:47:07 -0700701 if (has_bias()) {
702 for (size_t i = 0; i < batch_size(); i++) {
703 for (size_t oy = 0; oy < output_height(); oy++) {
704 for (size_t ox = 0; ox < output_width(); ox++) {
705 for (size_t g = 0; g < groups(); g++) {
706 for (size_t oc = 0; oc < group_output_channels(); oc++) {
707 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
708 bias[g * group_output_channels() + oc];
709 }
XNNPACK Teamb455b122019-09-27 18:10:33 -0700710 }
711 }
712 }
713 }
Marat Dukhanf568f082019-10-30 09:47:07 -0700714 } else {
715 std::fill(accumulators.begin(), accumulators.end(), 0);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700716 }
717 if (depthwise_layout()) {
718 ASSERT_EQ(group_input_channels(), 1);
719
720 for (size_t i = 0; i < batch_size(); i++) {
721 for (size_t oy = 0; oy < output_height(); oy++) {
722 for (size_t ox = 0; ox < output_width(); ox++) {
723 for (size_t ky = 0; ky < kernel_height(); ky++) {
724 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
725 if (iy < input_height()) {
726 for (size_t kx = 0; kx < kernel_width(); kx++) {
727 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
728 if (ix < input_width()) {
729 for (size_t g = 0; g < groups(); g++) {
730 for (size_t oc = 0; oc < group_output_channels(); oc++) {
731 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700732 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g]) - int32_t(input_zero_point)) *
XNNPACK Teamb455b122019-09-27 18:10:33 -0700733 (int32_t(kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]) - int32_t(kernel_zero_point));
734 }
735 }
736 }
737 }
738 }
739 }
740 }
741 }
742 }
743 } else {
744 for (size_t i = 0; i < batch_size(); i++) {
745 for (size_t oy = 0; oy < output_height(); oy++) {
746 for (size_t ox = 0; ox < output_width(); ox++) {
747 for (size_t ky = 0; ky < kernel_height(); ky++) {
748 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
749 if (iy < input_height()) {
750 for (size_t kx = 0; kx < kernel_width(); kx++) {
751 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
752 if (ix < input_width()) {
753 for (size_t g = 0; g < groups(); g++) {
754 for (size_t oc = 0; oc < group_output_channels(); oc++) {
755 for (size_t ic = 0; ic < group_input_channels(); ic++) {
756 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700757 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
XNNPACK Teamb455b122019-09-27 18:10:33 -0700758 (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
759 }
760 }
761 }
762 }
763 }
764 }
765 }
766 }
767 }
768 }
769 }
770
771 // Compute renormalization parameters.
772 const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
773 const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
774
775 const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
776 const uint8_t output_zero_point = uint8_t(std::max(std::min(
777 lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
778 long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));
779
780 // Renormalize reference results.
781 std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
782 [this, output_scale, output_zero_point](int32_t x) -> double {
783 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
784 });
785
786 // Create, setup, run, and destroy Convolution operator.
Marat Dukhan04f03be2019-11-19 12:36:47 -0800787 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700788 xnn_operator_t convolution_op = nullptr;
789
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700790 xnn_status status = xnn_create_convolution2d_nhwc_qu8(
Marat Dukhan8440fde2019-10-24 12:46:13 -0700791 padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
792 padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
XNNPACK Teamb455b122019-09-27 18:10:33 -0700793 kernel_height(), kernel_width(),
794 subsampling_height(), subsampling_width(),
795 dilation_height(), dilation_width(),
796 groups(), group_input_channels(), group_output_channels(),
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700797 input_channel_stride(), output_channel_stride(),
XNNPACK Teamb455b122019-09-27 18:10:33 -0700798 input_zero_point, 1.0f /* input scale */,
799 kernel_zero_point, 1.0f /* kernel scale */,
Marat Dukhanf568f082019-10-30 09:47:07 -0700800 kernel.data(), has_bias() ? bias.data() : nullptr,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700801 output_zero_point, output_scale, qmin(), qmax(),
Marat Dukhan8440fde2019-10-24 12:46:13 -0700802 (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700803 &convolution_op);
804 if (status == xnn_status_unsupported_hardware) {
805 GTEST_SKIP();
806 }
807 ASSERT_EQ(xnn_status_success, status);
808 ASSERT_NE(nullptr, convolution_op);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700809
810 // Smart pointer to automatically delete convolution_op.
811 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
812
813 ASSERT_EQ(xnn_status_success,
Marat Dukhan08b7a972020-07-14 18:17:29 -0700814 xnn_setup_convolution2d_nhwc_qu8(
XNNPACK Teamb455b122019-09-27 18:10:33 -0700815 convolution_op,
816 batch_size(), input_height(), input_width(),
817 input.data(), output.data(),
818 nullptr /* thread pool */));
819
820 ASSERT_EQ(xnn_status_success,
821 xnn_run_operator(convolution_op, nullptr /* thread pool */));
822
823 // Verify results.
824 for (size_t i = 0; i < batch_size(); i++) {
825 for (size_t y = 0; y < output_height(); y++) {
826 for (size_t x = 0; x < output_width(); x++) {
827 for (size_t g = 0; g < groups(); g++) {
828 for (size_t c = 0; c < group_output_channels(); c++) {
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700829 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
XNNPACK Teamb455b122019-09-27 18:10:33 -0700830 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700831 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
XNNPACK Teamb455b122019-09-27 18:10:33 -0700832 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
833 ASSERT_NEAR(
834 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700835 double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
XNNPACK Teamb455b122019-09-27 18:10:33 -0700836 0.9)
837 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
838 }
839 }
840 }
841 }
842 }
843 }
844 }
845
Marat Dukhanefc47b82019-11-18 09:25:38 -0800846 void TestNHWCxF32() const {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700847 std::random_device random_device;
848 auto rng = std::mt19937(random_device());
849 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);
850
851 std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700852 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700853 std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
854 std::vector<float> bias(groups() * group_output_channels());
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700855 std::vector<float> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700856 std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
857
858 for (size_t iteration = 0; iteration < iterations(); iteration++) {
859 std::generate(input.begin(), input.end(), std::ref(f32rng));
860 std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
861 std::generate(bias.begin(), bias.end(), std::ref(f32rng));
862 std::fill(output.begin(), output.end(), nanf(""));
863
864 // Compute reference results, without clamping.
Marat Dukhanf568f082019-10-30 09:47:07 -0700865 if (has_bias()) {
866 for (size_t i = 0; i < batch_size(); i++) {
867 for (size_t oy = 0; oy < output_height(); oy++) {
868 for (size_t ox = 0; ox < output_width(); ox++) {
869 for (size_t g = 0; g < groups(); g++) {
870 for (size_t oc = 0; oc < group_output_channels(); oc++) {
871 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
872 bias[g * group_output_channels() + oc];
873 }
XNNPACK Teamb455b122019-09-27 18:10:33 -0700874 }
875 }
876 }
877 }
Marat Dukhanf568f082019-10-30 09:47:07 -0700878 } else {
879 std::fill(output_ref.begin(), output_ref.end(), 0.0f);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700880 }
881 if (depthwise_layout()) {
882 ASSERT_EQ(group_input_channels(), 1);
883
884 for (size_t i = 0; i < batch_size(); i++) {
885 for (size_t oy = 0; oy < output_height(); oy++) {
886 for (size_t ox = 0; ox < output_width(); ox++) {
887 for (size_t ky = 0; ky < kernel_height(); ky++) {
888 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
889 if (iy < input_height()) {
890 for (size_t kx = 0; kx < kernel_width(); kx++) {
891 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
892 if (ix < input_width()) {
893 for (size_t g = 0; g < groups(); g++) {
894 for (size_t oc = 0; oc < group_output_channels(); oc++) {
895 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700896 input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g] *
XNNPACK Teamb455b122019-09-27 18:10:33 -0700897 kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc];
898 }
899 }
900 }
901 }
902 }
903 }
904 }
905 }
906 }
907 } else {
908 for (size_t i = 0; i < batch_size(); i++) {
909 for (size_t oy = 0; oy < output_height(); oy++) {
910 for (size_t ox = 0; ox < output_width(); ox++) {
911 for (size_t ky = 0; ky < kernel_height(); ky++) {
912 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
913 if (iy < input_height()) {
914 for (size_t kx = 0; kx < kernel_width(); kx++) {
915 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
916 if (ix < input_width()) {
917 for (size_t g = 0; g < groups(); g++) {
918 for (size_t oc = 0; oc < group_output_channels(); oc++) {
919 for (size_t ic = 0; ic < group_input_channels(); ic++) {
920 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700921 input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] *
XNNPACK Teamb455b122019-09-27 18:10:33 -0700922 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
923 }
924 }
925 }
926 }
927 }
928 }
929 }
930 }
931 }
932 }
933 }
934
935 // Compute clamping parameters.
936 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
937 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
938
939 const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
940 const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
941
942 // Clamp reference results.
943 for (float& value : output_ref) {
944 value = std::max(std::min(value, output_max), output_min);
945 }
946
947 // Create, setup, run, and destroy Convolution operator.
Marat Dukhan04f03be2019-11-19 12:36:47 -0800948 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700949 xnn_operator_t convolution_op = nullptr;
950
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700951 xnn_status status = xnn_create_convolution2d_nhwc_f32(
Marat Dukhan8440fde2019-10-24 12:46:13 -0700952 padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
953 padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
XNNPACK Teamb455b122019-09-27 18:10:33 -0700954 kernel_height(), kernel_width(),
955 subsampling_height(), subsampling_width(),
956 dilation_height(), dilation_width(),
957 groups(), group_input_channels(), group_output_channels(),
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700958 input_channel_stride(), output_channel_stride(),
Marat Dukhanf568f082019-10-30 09:47:07 -0700959 kernel.data(), has_bias() ? bias.data() : nullptr,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700960 output_min, output_max,
Marat Dukhan8440fde2019-10-24 12:46:13 -0700961 (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
Marat Dukhan16f1e1a2020-08-04 16:38:22 -0700962 &convolution_op);
963 if (status == xnn_status_unsupported_hardware) {
964 GTEST_SKIP();
965 }
966 ASSERT_EQ(xnn_status_success, status);
967 ASSERT_NE(nullptr, convolution_op);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700968
969 // Smart pointer to automatically delete convolution_op.
970 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
971
972 ASSERT_EQ(xnn_status_success,
973 xnn_setup_convolution2d_nhwc_f32(
974 convolution_op,
975 batch_size(), input_height(), input_width(),
976 input.data(), output.data(),
977 nullptr /* thread pool */));
978
979 ASSERT_EQ(xnn_status_success,
980 xnn_run_operator(convolution_op, nullptr /* thread pool */));
981
982 // Verify results.
983 for (size_t i = 0; i < batch_size(); i++) {
984 for (size_t y = 0; y < output_height(); y++) {
985 for (size_t x = 0; x < output_width(); x++) {
986 for (size_t g = 0; g < groups(); g++) {
987 for (size_t c = 0; c < group_output_channels(); c++) {
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700988 ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min)
XNNPACK Teamb455b122019-09-27 18:10:33 -0700989 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700990 ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max)
XNNPACK Teamb455b122019-09-27 18:10:33 -0700991 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
992 ASSERT_NEAR(
993 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
Marat Dukhanc3d52cf2020-06-18 07:56:25 -0700994 output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c],
XNNPACK Teamb455b122019-09-27 18:10:33 -0700995 1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
996 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
997 }
998 }
999 }
1000 }
1001 }
1002 }
1003 }
1004
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001005 void TestNHWCxF16() const {
1006 std::random_device random_device;
1007 auto rng = std::mt19937(random_device());
1008 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);
1009 auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
1010
1011 std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
1012 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
1013 std::vector<uint16_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1014 std::vector<uint16_t> bias(groups() * group_output_channels());
1015 std::vector<uint16_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
1016 std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1017
1018 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1019 std::generate(input.begin(), input.end(), std::ref(f16rng));
1020 std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
1021 std::generate(bias.begin(), bias.end(), std::ref(f16rng));
1022 std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
1023
1024 // Compute reference results, without clamping.
1025 if (has_bias()) {
1026 for (size_t i = 0; i < batch_size(); i++) {
1027 for (size_t oy = 0; oy < output_height(); oy++) {
1028 for (size_t ox = 0; ox < output_width(); ox++) {
1029 for (size_t g = 0; g < groups(); g++) {
1030 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1031 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1032 fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
1033 }
1034 }
1035 }
1036 }
1037 }
1038 } else {
1039 std::fill(output_ref.begin(), output_ref.end(), 0.0f);
1040 }
1041 if (depthwise_layout()) {
1042 ASSERT_EQ(group_input_channels(), 1);
1043
1044 for (size_t i = 0; i < batch_size(); i++) {
1045 for (size_t oy = 0; oy < output_height(); oy++) {
1046 for (size_t ox = 0; ox < output_width(); ox++) {
1047 for (size_t ky = 0; ky < kernel_height(); ky++) {
1048 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1049 if (iy < input_height()) {
1050 for (size_t kx = 0; kx < kernel_width(); kx++) {
1051 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1052 if (ix < input_width()) {
1053 for (size_t g = 0; g < groups(); g++) {
1054 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1055 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1056 fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g]) *
1057 fp16_ieee_to_fp32_value(kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]);
1058 }
1059 }
1060 }
1061 }
1062 }
1063 }
1064 }
1065 }
1066 }
1067 } else {
1068 for (size_t i = 0; i < batch_size(); i++) {
1069 for (size_t oy = 0; oy < output_height(); oy++) {
1070 for (size_t ox = 0; ox < output_width(); ox++) {
1071 for (size_t ky = 0; ky < kernel_height(); ky++) {
1072 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1073 if (iy < input_height()) {
1074 for (size_t kx = 0; kx < kernel_width(); kx++) {
1075 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1076 if (ix < input_width()) {
1077 for (size_t g = 0; g < groups(); g++) {
1078 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1079 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1080 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1081 fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) *
1082 fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1083 }
1084 }
1085 }
1086 }
1087 }
1088 }
1089 }
1090 }
1091 }
1092 }
1093 }
1094
1095 // Compute clamping parameters.
1096 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1097 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1098 const float accumulated_range = accumulated_max - accumulated_min;
1099 const float scaled_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
1100 const float scaled_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));
1101 const float output_min = scaled_min == scaled_max ? -std::numeric_limits<float>::infinity() : scaled_min;
1102 const float output_max = scaled_min == scaled_max ? +std::numeric_limits<float>::infinity() : scaled_max;
1103
1104 // Clamp reference results.
1105 for (float& value : output_ref) {
1106 value = std::max(std::min(value, output_max), output_min);
1107 }
1108
1109 // Create, setup, run, and destroy Convolution operator.
1110 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1111 xnn_operator_t convolution_op = nullptr;
1112
1113 xnn_status status = xnn_create_convolution2d_nhwc_f16(
1114 padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
1115 padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
1116 kernel_height(), kernel_width(),
1117 subsampling_height(), subsampling_width(),
1118 dilation_height(), dilation_width(),
1119 groups(), group_input_channels(), group_output_channels(),
1120 input_channel_stride(), output_channel_stride(),
1121 kernel.data(), has_bias() ? bias.data() : nullptr,
1122 output_min, output_max,
1123 (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
1124 &convolution_op);
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001125 if (status == xnn_status_unsupported_hardware) {
1126 GTEST_SKIP();
1127 }
1128 ASSERT_EQ(xnn_status_success, status);
1129 ASSERT_NE(nullptr, convolution_op);
1130
1131 // Smart pointer to automatically delete convolution_op.
1132 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1133
1134 ASSERT_EQ(xnn_status_success,
1135 xnn_setup_convolution2d_nhwc_f16(
1136 convolution_op,
1137 batch_size(), input_height(), input_width(),
1138 input.data(), output.data(),
1139 nullptr /* thread pool */));
1140
1141 ASSERT_EQ(xnn_status_success,
1142 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1143
1144 // Verify results.
1145 for (size_t i = 0; i < batch_size(); i++) {
1146 for (size_t y = 0; y < output_height(); y++) {
1147 for (size_t x = 0; x < output_width(); x++) {
1148 for (size_t g = 0; g < groups(); g++) {
1149 for (size_t c = 0; c < group_output_channels(); c++) {
1150// ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min)
1151// << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1152// ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max)
1153// << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1154 ASSERT_NEAR(
1155 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1156 fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]),
1157 1.0e-2 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
1158 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1159 }
1160 }
1161 }
1162 }
1163 }
1164 }
1165 }
1166
Marat Dukhanefc47b82019-11-18 09:25:38 -08001167 void TestNCHWxF32() const {
Marat Dukhanefc47b82019-11-18 09:25:38 -08001168 std::random_device random_device;
1169 auto rng = std::mt19937(random_device());
1170 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);
1171 auto prng = std::bind(std::uniform_real_distribution<float>(), rng);
1172
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001173 std::vector<float> input(2 * XNN_EXTRA_BYTES / sizeof(float) +
1174 ((batch_size() - 1) * input_channel_stride() + groups() * group_input_channels()) * input_height() * input_width());
Marat Dukhanefc47b82019-11-18 09:25:38 -08001175 std::vector<float> kernel(
1176 groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1177 std::vector<float> bias(groups() * group_output_channels());
1178 std::vector<float> output(
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001179 ((batch_size() - 1) * output_channel_stride() + groups() * group_output_channels()) * output_height() * output_width());
Marat Dukhanefc47b82019-11-18 09:25:38 -08001180 std::vector<float> output_ref(batch_size() * groups() * group_output_channels() * output_height() * output_width());
1181
1182 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1183 std::generate(input.begin(), input.end(), std::ref(f32rng));
1184 std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
1185 for (float& k : kernel) {
1186 if (prng() <= sparsity()) {
1187 k = 0.0f;
1188 }
1189 }
1190 std::generate(bias.begin(), bias.end(), std::ref(f32rng));
1191 std::fill(output.begin(), output.end(), nanf(""));
1192
1193 // Compute reference results, without clamping.
1194 if (has_bias()) {
1195 for (size_t i = 0; i < batch_size(); i++) {
1196 for (size_t oy = 0; oy < output_height(); oy++) {
1197 for (size_t ox = 0; ox < output_width(); ox++) {
1198 for (size_t g = 0; g < groups(); g++) {
1199 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1200 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] =
1201 bias[g * group_output_channels() + oc];
1202 }
1203 }
1204 }
1205 }
1206 }
1207 } else {
1208 std::fill(output_ref.begin(), output_ref.end(), 0.0f);
1209 }
1210 if (force_nhwc_input()) {
1211 for (size_t i = 0; i < batch_size(); i++) {
1212 for (size_t oy = 0; oy < output_height(); oy++) {
1213 for (size_t ox = 0; ox < output_width(); ox++) {
1214 for (size_t ky = 0; ky < kernel_height(); ky++) {
1215 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1216 if (iy < input_height()) {
1217 for (size_t kx = 0; kx < kernel_width(); kx++) {
1218 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1219 if (ix < input_width()) {
1220 for (size_t g = 0; g < groups(); g++) {
1221 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1222 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1223 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] +=
1224 input[((((i * input_height() + iy) * input_width() + ix) * groups() + g) * group_input_channels() + ic)] *
1225 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
1226 }
1227 }
1228 }
1229 }
1230 }
1231 }
1232 }
1233 }
1234 }
1235 }
Marat Dukhan33032712020-06-18 11:06:04 -07001236 } else if (depthwise_layout()) {
1237 ASSERT_EQ(group_input_channels(), 1);
1238
1239 for (size_t i = 0; i < batch_size(); i++) {
1240 for (size_t oy = 0; oy < output_height(); oy++) {
1241 for (size_t ox = 0; ox < output_width(); ox++) {
1242 for (size_t ky = 0; ky < kernel_height(); ky++) {
1243 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1244 if (iy < input_height()) {
1245 for (size_t kx = 0; kx < kernel_width(); kx++) {
1246 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1247 if (ix < input_width()) {
1248 for (size_t g = 0; g < groups(); g++) {
1249 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1250 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] +=
1251 input[((i * input_channel_stride() + g) * input_height() + iy) * input_width() + ix] *
1252 kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc];
1253 }
1254 }
1255 }
1256 }
1257 }
1258 }
1259 }
1260 }
1261 }
Marat Dukhanefc47b82019-11-18 09:25:38 -08001262 } else {
1263 for (size_t i = 0; i < batch_size(); i++) {
1264 for (size_t oy = 0; oy < output_height(); oy++) {
1265 for (size_t ox = 0; ox < output_width(); ox++) {
1266 for (size_t ky = 0; ky < kernel_height(); ky++) {
1267 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1268 if (iy < input_height()) {
1269 for (size_t kx = 0; kx < kernel_width(); kx++) {
1270 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1271 if (ix < input_width()) {
1272 for (size_t g = 0; g < groups(); g++) {
1273 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1274 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1275 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] +=
1276 input[((i * input_channel_stride() + g * group_input_channels() + ic) * input_height() + iy) * input_width() + ix] *
1277 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
1278 }
1279 }
1280 }
1281 }
1282 }
1283 }
1284 }
1285 }
1286 }
1287 }
1288 }
1289
1290 // Compute clamping parameters.
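// qmin()/qmax() in [0, 255] are mapped onto the observed accumulation range to derive float clamping bounds; the defaults 0/255 disable clamping via +/-infinity.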
1291 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1292 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1293
1294 const float output_min = qmin() == 0 ? -std::numeric_limits<float>::infinity() :
1295 accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
1296 const float output_max = qmax() == 255 ? std::numeric_limits<float>::infinity() :
1297 accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
1298
1299 // Clamp reference results.
1300 for (float& value : output_ref) {
1301 value = std::max(std::min(value, output_max), output_min);
1302 }
1303
1304 // Create, setup, run, and destroy Convolution operator.
1305 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1306 xnn_operator_t convolution_op = nullptr;
1307
1308 xnn_status status = xnn_create_convolution2d_nchw_f32(
1309 padding_top(), padding_right(), padding_bottom(), padding_left(),
1310 kernel_height(), kernel_width(),
1311 subsampling_height(), subsampling_width(),
1312 dilation_height(), dilation_width(),
1313 groups(), group_input_channels(), group_output_channels(),
1314 input_channel_stride(), output_channel_stride(),
1315 kernel.data(), has_bias() ? bias.data() : nullptr,
1316 output_min, output_max,
1317 (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (force_nhwc_input() ? XNN_FLAG_INPUT_NHWC : 0),
1318 &convolution_op);
1319 if (status == xnn_status_unsupported_parameter) {
1320 GTEST_SKIP();
1321 }
1322 ASSERT_EQ(xnn_status_success, status);
1323 ASSERT_NE(nullptr, convolution_op);
1324
1325 // Smart pointer to automatically delete convolution_op.
1326 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1327
1328 ASSERT_EQ(xnn_status_success,
1329 xnn_setup_convolution2d_nchw_f32(
1330 convolution_op,
1331 batch_size(), input_height(), input_width(),
1332 input.data(), output.data(),
1333 nullptr /* thread pool */));
1334
1335 ASSERT_EQ(xnn_status_success,
1336 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1337
1338 // Verify results.
1339 for (size_t i = 0; i < batch_size(); i++) {
1340 for (size_t y = 0; y < output_height(); y++) {
1341 for (size_t x = 0; x < output_width(); x++) {
1342 for (size_t g = 0; g < groups(); g++) {
1343 for (size_t c = 0; c < group_output_channels(); c++) {
1344 ASSERT_GE(output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x], output_min)
1345 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i;
1346 ASSERT_LE(output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x], output_max)
1347 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i;
1348 ASSERT_NEAR(
1349 output_ref[(((i * groups() + g) * group_output_channels() + c) * output_height() + y) * output_width() + x],
1350 output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x],
1351 1.0e-4 * std::abs(output_ref[(((i * groups() + g) * group_output_channels() + c) * output_height() + y) * output_width() + x]))
1352 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i;
1353 }
1354 }
1355 }
1356 }
1357 }
1358 }
1359 }
1360
1361 void TestSetupNHWCxQS8() const {
1362 ASSERT_FALSE(depthwise_layout());
1363
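// The operator is created and run once at the initial shape, then set up again with the next_* shape to verify that re-setup of an existing operator still produces correct results.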
1364 std::random_device random_device;
1365 auto rng = std::mt19937(random_device());
1366 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
1367 auto i8rng = std::bind(
1368 std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), rng);
1369
1370 std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + std::max(
1371 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
1372 next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())) + 8);
1373 std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1374 std::vector<int32_t> bias(groups() * group_output_channels());
1375 std::vector<int8_t> output(std::max(
1376 batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
1377 next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
1378 std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1379 std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1380 std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1381 std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1382
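// A non-zero input zero point makes the test sensitive to implementations that forget to subtract it; the reference computation below subtracts it explicitly.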
1383 const int8_t input_zero_point = -1;
1384
1385 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1386 std::generate(input.begin(), input.end(), std::ref(i8rng));
1387 std::generate(kernel.begin(), kernel.end(), std::ref(i8rng));
1388 std::generate(bias.begin(), bias.end(), std::ref(i32rng));
1389 std::fill(output.begin(), output.end(), 0xA5);
1390
1391 // Compute reference results, without renormalization.
1392 if (has_bias()) {
1393 for (size_t i = 0; i < batch_size(); i++) {
1394 for (size_t oy = 0; oy < output_height(); oy++) {
1395 for (size_t ox = 0; ox < output_width(); ox++) {
1396 for (size_t g = 0; g < groups(); g++) {
1397 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1398 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1399 bias[g * group_output_channels() + oc];
1400 }
1401 }
1402 }
1403 }
1404 }
1405 } else {
1406 std::fill(accumulators.begin(), accumulators.end(), 0);
1407 }
1408 for (size_t i = 0; i < batch_size(); i++) {
1409 for (size_t oy = 0; oy < output_height(); oy++) {
1410 for (size_t ox = 0; ox < output_width(); ox++) {
1411 for (size_t ky = 0; ky < kernel_height(); ky++) {
1412 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1413 if (iy < input_height()) {
1414 for (size_t kx = 0; kx < kernel_width(); kx++) {
1415 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1416 if (ix < input_width()) {
1417 for (size_t g = 0; g < groups(); g++) {
1418 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1419 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1420 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1421 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1422 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1423 }
1424 }
1425 }
1426 }
1427 }
1428 }
1429 }
1430 }
1431 }
1432 }
1433
1434 // Compute renormalization parameters.
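// output_scale spreads the observed accumulator range over roughly 255 quantized steps, and output_zero_point re-centers that range within int8.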
1435 const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
1436 const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
1437
1438 const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
1439 const int8_t output_zero_point = int8_t(std::max(std::min(
1440 lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
1441 long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));
1442
1443 // Renormalize reference results.
1444 std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
1445 [this, output_scale, output_zero_point](int32_t x) -> double {
1446 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
1447 });
1448
1449 // Create, setup, and run Convolution operator once.
1450 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1451 xnn_operator_t convolution_op = nullptr;
1452
1453 xnn_status status = xnn_create_convolution2d_nhwc_qs8(
1454 padding_top(), padding_right(), padding_bottom(), padding_left(),
1455 kernel_height(), kernel_width(),
1456 subsampling_height(), subsampling_width(),
1457 dilation_height(), dilation_width(),
1458 groups(), group_input_channels(), group_output_channels(),
1459 input_channel_stride(), output_channel_stride(),
1460 input_zero_point, 1.0f /* input scale */, 1.0f /* kernel scale */,
1461 kernel.data(), has_bias() ? bias.data() : nullptr,
1462 output_zero_point, output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
1463 0, &convolution_op);
1464 if (status == xnn_status_unsupported_hardware) {
1465 GTEST_SKIP();
1466 }
1467 ASSERT_EQ(xnn_status_success, status);
1468 ASSERT_NE(nullptr, convolution_op);
1469
1470 // Smart pointer to automatically delete convolution_op.
1471 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1472
1473 ASSERT_EQ(xnn_status_success,
1474 xnn_setup_convolution2d_nhwc_qs8(
1475 convolution_op,
1476 batch_size(), input_height(), input_width(),
1477 input.data(), output.data(),
1478 nullptr /* thread pool */));
1479
1480 ASSERT_EQ(xnn_status_success,
1481 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1482
1483 // Verify results of the first run.
1484 for (size_t i = 0; i < batch_size(); i++) {
1485 for (size_t y = 0; y < output_height(); y++) {
1486 for (size_t x = 0; x < output_width(); x++) {
1487 for (size_t g = 0; g < groups(); g++) {
1488 for (size_t c = 0; c < group_output_channels(); c++) {
1489 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
1490 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1491 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
1492 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1493 ASSERT_NEAR(
1494 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1495 double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1496 0.9)
1497 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1498 }
1499 }
1500 }
1501 }
1502 }
1503
1504 // Re-generate data for the second run.
1505 std::generate(input.begin(), input.end(), std::ref(i8rng));
1506 std::fill(output.begin(), output.end(), 0xA5);
1507
1508 // Compute reference results for the second run, including renormalization.
1509 if (has_bias()) {
1510 for (size_t i = 0; i < next_batch_size(); i++) {
1511 for (size_t oy = 0; oy < next_output_height(); oy++) {
1512 for (size_t ox = 0; ox < next_output_width(); ox++) {
1513 for (size_t g = 0; g < groups(); g++) {
1514 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1515 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1516 bias[g * group_output_channels() + oc];
1517 }
1518 }
1519 }
1520 }
1521 }
1522 } else {
1523 std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
1524 }
1525 for (size_t i = 0; i < next_batch_size(); i++) {
1526 for (size_t oy = 0; oy < next_output_height(); oy++) {
1527 for (size_t ox = 0; ox < next_output_width(); ox++) {
1528 for (size_t ky = 0; ky < kernel_height(); ky++) {
1529 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1530 if (iy < next_input_height()) {
1531 for (size_t kx = 0; kx < kernel_width(); kx++) {
1532 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1533 if (ix < next_input_width()) {
1534 for (size_t g = 0; g < groups(); g++) {
1535 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1536 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1537 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1538 (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1539 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1540 }
1541 }
1542 }
1543 }
1544 }
1545 }
1546 }
1547 }
1548 }
1549 }
1550 std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(),
1551 [this, output_scale, output_zero_point](int32_t x) -> double {
1552 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
1553 });
1554
1555 // Setup and run Convolution operator the second time, and destroy the operator.
1556 ASSERT_EQ(xnn_status_success,
1557 xnn_setup_convolution2d_nhwc_qs8(
1558 convolution_op,
1559 next_batch_size(), next_input_height(), next_input_width(),
1560 input.data(), output.data(),
1561 nullptr /* thread pool */));
1562
1563 ASSERT_EQ(xnn_status_success,
1564 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1565
1566 // Verify results of the second run.
1567 for (size_t i = 0; i < next_batch_size(); i++) {
1568 for (size_t y = 0; y < next_output_height(); y++) {
1569 for (size_t x = 0; x < next_output_width(); x++) {
1570 for (size_t g = 0; g < groups(); g++) {
1571 for (size_t c = 0; c < group_output_channels(); c++) {
1572 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
1573 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1574 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
1575 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1576 ASSERT_NEAR(
1577 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
1578 double(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1579 0.9)
1580 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1581 }
1582 }
1583 }
1584 }
1585 }
1586 }
1587 }
1588
1589 void TestSetupNHWCxQU8() const {
1590 ASSERT_FALSE(depthwise_layout());
1591
1592 std::random_device random_device;
1593 auto rng = std::mt19937(random_device());
1594 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
1595 auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
1596
1597 std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + std::max(
1598 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
1599 next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())) + 8);
1600 std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1601 std::vector<int32_t> bias(groups() * group_output_channels());
1602 std::vector<uint8_t> output(std::max(
1603 batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
1604 next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
1605 std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1606 std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1607 std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1608 std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1609
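// Mid-range zero points keep the random uint8 data roughly symmetric around zero after zero-point subtraction.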
1610 const uint8_t input_zero_point = 127;
1611 const uint8_t kernel_zero_point = 127;
1612
1613 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1614 std::generate(input.begin(), input.end(), std::ref(u8rng));
1615 std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
1616 std::generate(bias.begin(), bias.end(), std::ref(i32rng));
1617 std::fill(output.begin(), output.end(), 0xA5);
1618
1619 // Compute reference results, without renormalization.
1620 if (has_bias()) {
1621 for (size_t i = 0; i < batch_size(); i++) {
1622 for (size_t oy = 0; oy < output_height(); oy++) {
1623 for (size_t ox = 0; ox < output_width(); ox++) {
1624 for (size_t g = 0; g < groups(); g++) {
1625 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1626 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1627 bias[g * group_output_channels() + oc];
1628 }
1629 }
1630 }
1631 }
1632 }
1633 } else {
1634 std::fill(accumulators.begin(), accumulators.end(), 0);
1635 }
1636 for (size_t i = 0; i < batch_size(); i++) {
1637 for (size_t oy = 0; oy < output_height(); oy++) {
1638 for (size_t ox = 0; ox < output_width(); ox++) {
1639 for (size_t ky = 0; ky < kernel_height(); ky++) {
1640 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1641 if (iy < input_height()) {
1642 for (size_t kx = 0; kx < kernel_width(); kx++) {
1643 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1644 if (ix < input_width()) {
1645 for (size_t g = 0; g < groups(); g++) {
1646 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1647 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1648 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1649 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1650 (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
1651 }
1652 }
1653 }
1654 }
1655 }
1656 }
1657 }
1658 }
1659 }
1660 }
1661
1662 // Compute renormalization parameters.
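// Same renormalization as in the QS8 test above, except the zero point is offset by 127.5 so that it lands in the unsigned uint8 range.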
1663 const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
1664 const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
1665
1666 const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
1667 const uint8_t output_zero_point = uint8_t(std::max(std::min(
1668 lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
1669 long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));
1670
1671 // Renormalize reference results.
1672 std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
1673 [this, output_scale, output_zero_point](int32_t x) -> double {
1674 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
1675 });
1676
1677 // Create, setup, and run Convolution operator once.
1678 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1679 xnn_operator_t convolution_op = nullptr;
1680
1681 xnn_status status = xnn_create_convolution2d_nhwc_qu8(
1682 padding_top(), padding_right(), padding_bottom(), padding_left(),
1683 kernel_height(), kernel_width(),
1684 subsampling_height(), subsampling_width(),
1685 dilation_height(), dilation_width(),
1686 groups(), group_input_channels(), group_output_channels(),
1687 input_channel_stride(), output_channel_stride(),
1688 input_zero_point, 1.0f /* input scale */,
1689 kernel_zero_point, 1.0f /* kernel scale */,
1690 kernel.data(), has_bias() ? bias.data() : nullptr,
1691 output_zero_point, output_scale, qmin(), qmax(),
1692 0, &convolution_op);
1693 if (status == xnn_status_unsupported_hardware) {
1694 GTEST_SKIP();
1695 }
1696 ASSERT_EQ(xnn_status_success, status);
1697 ASSERT_NE(nullptr, convolution_op);
1698
1699 // Smart pointer to automatically delete convolution_op.
1700 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1701
1702 ASSERT_EQ(xnn_status_success,
1703 xnn_setup_convolution2d_nhwc_qu8(
1704 convolution_op,
1705 batch_size(), input_height(), input_width(),
1706 input.data(), output.data(),
1707 nullptr /* thread pool */));
1708
1709 ASSERT_EQ(xnn_status_success,
1710 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1711
1712 // Verify results of the first run.
1713 for (size_t i = 0; i < batch_size(); i++) {
1714 for (size_t y = 0; y < output_height(); y++) {
1715 for (size_t x = 0; x < output_width(); x++) {
1716 for (size_t g = 0; g < groups(); g++) {
1717 for (size_t c = 0; c < group_output_channels(); c++) {
1718 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
1719 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1720 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
1721 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1722 ASSERT_NEAR(
1723 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1724 double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1725 0.9)
1726 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1727 }
1728 }
1729 }
1730 }
1731 }
1732
1733 // Re-generate data for the second run.
1734 std::generate(input.begin(), input.end(), std::ref(u8rng));
1735 std::fill(output.begin(), output.end(), 0xA5);
1736
1737 // Compute reference results for the second run, including renormalization.
1738 if (has_bias()) {
1739 for (size_t i = 0; i < next_batch_size(); i++) {
1740 for (size_t oy = 0; oy < next_output_height(); oy++) {
1741 for (size_t ox = 0; ox < next_output_width(); ox++) {
1742 for (size_t g = 0; g < groups(); g++) {
1743 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1744 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1745 bias[g * group_output_channels() + oc];
1746 }
1747 }
1748 }
1749 }
1750 }
1751 } else {
1752 std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
1753 }
1754 for (size_t i = 0; i < next_batch_size(); i++) {
1755 for (size_t oy = 0; oy < next_output_height(); oy++) {
1756 for (size_t ox = 0; ox < next_output_width(); ox++) {
1757 for (size_t ky = 0; ky < kernel_height(); ky++) {
1758 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1759 if (iy < next_input_height()) {
1760 for (size_t kx = 0; kx < kernel_width(); kx++) {
1761 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1762 if (ix < next_input_width()) {
1763 for (size_t g = 0; g < groups(); g++) {
1764 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1765 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1766 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1767 (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1768 (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
1769 }
1770 }
1771 }
1772 }
1773 }
1774 }
1775 }
1776 }
1777 }
1778 }
1779 std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(),
1780 [this, output_scale, output_zero_point](int32_t x) -> double {
1781 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
1782 });
1783
1784 // Setup and run Convolution operator the second time, and destroy the operator.
1785 ASSERT_EQ(xnn_status_success,
1786 xnn_setup_convolution2d_nhwc_qu8(
1787 convolution_op,
1788 next_batch_size(), next_input_height(), next_input_width(),
1789 input.data(), output.data(),
1790 nullptr /* thread pool */));
1791
1792 ASSERT_EQ(xnn_status_success,
1793 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1794
1795 // Verify results of the second run.
1796 for (size_t i = 0; i < next_batch_size(); i++) {
1797 for (size_t y = 0; y < next_output_height(); y++) {
1798 for (size_t x = 0; x < next_output_width(); x++) {
1799 for (size_t g = 0; g < groups(); g++) {
1800 for (size_t c = 0; c < group_output_channels(); c++) {
1801 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
1802 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1803 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
1804 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1805 ASSERT_NEAR(
1806 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
1807 double(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1808 0.9)
1809 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1810 }
1811 }
1812 }
1813 }
1814 }
1815 }
1816 }
1817
1818 void TestSetupNHWCxF16() const {
1819 ASSERT_FALSE(depthwise_layout());
1820
1821 std::random_device random_device;
1822 auto rng = std::mt19937(random_device());
1823 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);
1824 auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
1825
1826 std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max(
1827 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
1828 next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
1829 std::vector<uint16_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1830 std::vector<uint16_t> bias(groups() * group_output_channels());
1831 std::vector<uint16_t> output(std::max(
1832 batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
1833 next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
1834 std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1835 std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1836
1837 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1838 std::generate(input.begin(), input.end(), std::ref(f16rng));
1839 std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
1840 std::generate(bias.begin(), bias.end(), std::ref(f16rng));
1841 std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
1842
1843 // Compute reference results, without clamping.
1844 if (has_bias()) {
1845 for (size_t i = 0; i < batch_size(); i++) {
1846 for (size_t oy = 0; oy < output_height(); oy++) {
1847 for (size_t ox = 0; ox < output_width(); ox++) {
1848 for (size_t g = 0; g < groups(); g++) {
1849 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1850 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1851 fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
1852 }
1853 }
1854 }
1855 }
1856 }
1857 } else {
1858 std::fill(output_ref.begin(), output_ref.end(), 0.0f);
1859 }
1860 for (size_t i = 0; i < batch_size(); i++) {
1861 for (size_t oy = 0; oy < output_height(); oy++) {
1862 for (size_t ox = 0; ox < output_width(); ox++) {
1863 for (size_t ky = 0; ky < kernel_height(); ky++) {
1864 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1865 if (iy < input_height()) {
1866 for (size_t kx = 0; kx < kernel_width(); kx++) {
1867 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1868 if (ix < input_width()) {
1869 for (size_t g = 0; g < groups(); g++) {
1870 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1871 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1872 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1873 fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) *
1874 fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1875 }
1876 }
1877 }
1878 }
1879 }
1880 }
1881 }
1882 }
1883 }
1884 }
1885
1886 // Compute clamping parameters.
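// The clamping bounds are rounded through fp16; if rounding collapses them to a single value, clamping is effectively disabled by substituting +/-infinity.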
1887 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1888 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1889 const float accumulated_range = accumulated_max - accumulated_min;
1890 const float scaled_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
1891 const float scaled_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));
1892 const float output_min = scaled_min == scaled_max ? -std::numeric_limits<float>::infinity() : scaled_min;
1893 const float output_max = scaled_min == scaled_max ? +std::numeric_limits<float>::infinity() : scaled_max;
1894
1895 for (float& output_value : output_ref) {
1896 output_value = std::min(std::max(output_value, output_min), output_max);
1897 }
1898
1899 // Create, setup, and run Convolution operator once.
1900 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1901 xnn_operator_t convolution_op = nullptr;
1902
1903 xnn_status status = xnn_create_convolution2d_nhwc_f16(
1904 padding_top(), padding_right(), padding_bottom(), padding_left(),
1905 kernel_height(), kernel_width(),
1906 subsampling_height(), subsampling_width(),
1907 dilation_height(), dilation_width(),
1908 groups(), group_input_channels(), group_output_channels(),
1909 input_channel_stride(), output_channel_stride(),
1910 kernel.data(), has_bias() ? bias.data() : nullptr,
1911 output_min, output_max,
1912 0, &convolution_op);
1913 if (status == xnn_status_unsupported_hardware) {
1914 GTEST_SKIP();
1915 }
1916 ASSERT_EQ(xnn_status_success, status);
1917 ASSERT_NE(nullptr, convolution_op);
1918
1919 // Smart pointer to automatically delete convolution_op.
1920 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1921
1922 ASSERT_EQ(xnn_status_success,
1923 xnn_setup_convolution2d_nhwc_f16(
1924 convolution_op,
1925 batch_size(), input_height(), input_width(),
1926 input.data(), output.data(),
1927 nullptr /* thread pool */));
1928
1929 ASSERT_EQ(xnn_status_success,
1930 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1931
1932 // Verify results of the first run.
1933 for (size_t i = 0; i < batch_size(); i++) {
1934 for (size_t y = 0; y < output_height(); y++) {
1935 for (size_t x = 0; x < output_width(); x++) {
1936 for (size_t g = 0; g < groups(); g++) {
1937 for (size_t c = 0; c < group_output_channels(); c++) {
1938 ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min)
1939 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1940 ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max)
1941 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1942 ASSERT_NEAR(
1943 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1944 fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]),
1945 1.0e-2 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
1946 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1947 }
1948 }
1949 }
1950 }
1951 }
1952
1953 // Re-generate data for the second run.
1954 std::generate(input.begin(), input.end(), std::ref(f16rng));
1955 std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
1956
1957 // Compute reference results for the second run, including clamping.
1958 if (has_bias()) {
1959 for (size_t i = 0; i < next_batch_size(); i++) {
1960 for (size_t oy = 0; oy < next_output_height(); oy++) {
1961 for (size_t ox = 0; ox < next_output_width(); ox++) {
1962 for (size_t g = 0; g < groups(); g++) {
1963 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1964 next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1965 fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
1966 }
1967 }
1968 }
1969 }
1970 }
1971 } else {
1972 std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
1973 }
1974 for (size_t i = 0; i < next_batch_size(); i++) {
1975 for (size_t oy = 0; oy < next_output_height(); oy++) {
1976 for (size_t ox = 0; ox < next_output_width(); ox++) {
1977 for (size_t ky = 0; ky < kernel_height(); ky++) {
1978 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1979 if (iy < next_input_height()) {
1980 for (size_t kx = 0; kx < kernel_width(); kx++) {
1981 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1982 if (ix < next_input_width()) {
1983 for (size_t g = 0; g < groups(); g++) {
1984 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1985 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1986 next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1987 fp16_ieee_to_fp32_value(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) *
1988 fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1989 }
1990 }
1991 }
1992 }
1993 }
1994 }
1995 }
1996 }
1997 }
1998 }
1999 for (float& value : next_output_ref) {
2000 value = std::max(std::min(value, output_max), output_min);
2001 }
2002
2003 // Setup and run Convolution operator the second time, and destroy the operator.
2004 ASSERT_EQ(xnn_status_success,
2005 xnn_setup_convolution2d_nhwc_f16(
2006 convolution_op,
2007 next_batch_size(), next_input_height(), next_input_width(),
2008 input.data(), output.data(),
2009 nullptr /* thread pool */));
2010
2011 ASSERT_EQ(xnn_status_success,
2012 xnn_run_operator(convolution_op, nullptr /* thread pool */));
2013
2014 // Verify results of the second run.
2015 for (size_t i = 0; i < next_batch_size(); i++) {
2016 for (size_t y = 0; y < next_output_height(); y++) {
2017 for (size_t x = 0; x < next_output_width(); x++) {
2018 for (size_t g = 0; g < groups(); g++) {
2019 for (size_t c = 0; c < group_output_channels(); c++) {
2020 ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min)
2021 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2022 ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max)
2023 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2024 ASSERT_NEAR(
2025 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
2026 fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]),
2027 1.0e-2 * std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]))
2028 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2029 }
2030 }
2031 }
2032 }
2033 }
2034 }
2035 }
2036
2037 void TestSetupNHWCxF32() const {
2038 ASSERT_FALSE(depthwise_layout());
2039
2040 std::random_device random_device;
2041 auto rng = std::mt19937(random_device());
2042 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);
2043
2044 std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + std::max(
2045 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
2046 next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
2047 std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
2048 std::vector<float> bias(groups() * group_output_channels());
2049 std::vector<float> output(std::max(
2050 batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
2051 next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
2052 std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
2053 std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
2054
2055 for (size_t iteration = 0; iteration < iterations(); iteration++) {
2056 std::generate(input.begin(), input.end(), std::ref(f32rng));
2057 std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
2058 std::generate(bias.begin(), bias.end(), std::ref(f32rng));
2059 std::fill(output.begin(), output.end(), nanf(""));
2060
2061 // Compute reference results, without clamping.
2062 if (has_bias()) {
2063 for (size_t i = 0; i < batch_size(); i++) {
2064 for (size_t oy = 0; oy < output_height(); oy++) {
2065 for (size_t ox = 0; ox < output_width(); ox++) {
2066 for (size_t g = 0; g < groups(); g++) {
2067 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2068 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2069 bias[g * group_output_channels() + oc];
2070 }
2071 }
2072 }
2073 }
2074 }
2075 } else {
2076 std::fill(output_ref.begin(), output_ref.end(), 0.0f);
2077 }
2078 for (size_t i = 0; i < batch_size(); i++) {
2079 for (size_t oy = 0; oy < output_height(); oy++) {
2080 for (size_t ox = 0; ox < output_width(); ox++) {
2081 for (size_t ky = 0; ky < kernel_height(); ky++) {
2082 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2083 if (iy < input_height()) {
2084 for (size_t kx = 0; kx < kernel_width(); kx++) {
2085 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2086 if (ix < input_width()) {
2087 for (size_t g = 0; g < groups(); g++) {
2088 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2089 for (size_t ic = 0; ic < group_input_channels(); ic++) {
2090 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2091 input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] *
2092 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
2093 }
2094 }
2095 }
2096 }
2097 }
2098 }
2099 }
2100 }
2101 }
2102 }
2103
2104 // Compute clamping parameters.
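// Unlike the NCHW test above, these bounds stay finite: qmin() == 0 / qmax() == 255 clamp to the extremes of the accumulated range, which is a no-op in practice.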
2105 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
2106 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
2107
2108 const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
2109 const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
2110
2111 // Clamp reference results.
2112 for (float& value : output_ref) {
2113 value = std::max(std::min(value, output_max), output_min);
2114 }
2115
2116 // Create, setup, and run Convolution operator once.
2117 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
2118 xnn_operator_t convolution_op = nullptr;
2119
2120 xnn_status status = xnn_create_convolution2d_nhwc_f32(
2121 padding_top(), padding_right(), padding_bottom(), padding_left(),
2122 kernel_height(), kernel_width(),
2123 subsampling_height(), subsampling_width(),
2124 dilation_height(), dilation_width(),
2125 groups(), group_input_channels(), group_output_channels(),
2126 input_channel_stride(), output_channel_stride(),
2127 kernel.data(), has_bias() ? bias.data() : nullptr,
2128 output_min, output_max,
2129 0, &convolution_op);
2130 if (status == xnn_status_unsupported_hardware) {
2131 GTEST_SKIP();
2132 }
2133 ASSERT_EQ(xnn_status_success, status);
2134 ASSERT_NE(nullptr, convolution_op);
2135
2136 // Smart pointer to automatically delete convolution_op.
2137 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
2138
2139 ASSERT_EQ(xnn_status_success,
2140 xnn_setup_convolution2d_nhwc_f32(
2141 convolution_op,
2142 batch_size(), input_height(), input_width(),
2143 input.data(), output.data(),
2144 nullptr /* thread pool */));
2145
2146 ASSERT_EQ(xnn_status_success,
2147 xnn_run_operator(convolution_op, nullptr /* thread pool */));
2148
2149 // Verify results of the first run.
2150 for (size_t i = 0; i < batch_size(); i++) {
2151 for (size_t y = 0; y < output_height(); y++) {
2152 for (size_t x = 0; x < output_width(); x++) {
2153 for (size_t g = 0; g < groups(); g++) {
2154 for (size_t c = 0; c < group_output_channels(); c++) {
2155 ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min)
2156 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2157 ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max)
2158 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2159 ASSERT_NEAR(
2160 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
2161 output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c],
2162 1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
2163 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2164 }
2165 }
2166 }
2167 }
2168 }
2169
2170 // Re-generate data for the second run.
2171 std::generate(input.begin(), input.end(), std::ref(f32rng));
2172 std::fill(output.begin(), output.end(), nanf(""));
2173
2174 // Compute reference results for the second run, including clamping.
2175 if (has_bias()) {
2176 for (size_t i = 0; i < next_batch_size(); i++) {
2177 for (size_t oy = 0; oy < next_output_height(); oy++) {
2178 for (size_t ox = 0; ox < next_output_width(); ox++) {
2179 for (size_t g = 0; g < groups(); g++) {
2180 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2181 next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2182 bias[g * group_output_channels() + oc];
2183 }
2184 }
2185 }
2186 }
2187 }
2188 } else {
2189 std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
2190 }
2191 for (size_t i = 0; i < next_batch_size(); i++) {
2192 for (size_t oy = 0; oy < next_output_height(); oy++) {
2193 for (size_t ox = 0; ox < next_output_width(); ox++) {
2194 for (size_t ky = 0; ky < kernel_height(); ky++) {
2195 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2196 if (iy < next_input_height()) {
2197 for (size_t kx = 0; kx < kernel_width(); kx++) {
2198 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2199 if (ix < next_input_width()) {
2200 for (size_t g = 0; g < groups(); g++) {
2201 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2202 for (size_t ic = 0; ic < group_input_channels(); ic++) {
2203 next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
2204 input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] *
2205 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
2206 }
2207 }
2208 }
2209 }
2210 }
2211 }
2212 }
2213 }
2214 }
2215 }
2216 for (float& value : next_output_ref) {
2217 value = std::max(std::min(value, output_max), output_min);
2218 }
2219
2220 // Setup and run Convolution operator the second time, and destroy the operator.
2221 ASSERT_EQ(xnn_status_success,
2222 xnn_setup_convolution2d_nhwc_f32(
2223 convolution_op,
2224 next_batch_size(), next_input_height(), next_input_width(),
2225 input.data(), output.data(),
2226 nullptr /* thread pool */));
2227
2228 ASSERT_EQ(xnn_status_success,
2229 xnn_run_operator(convolution_op, nullptr /* thread pool */));
2230
2231 // Verify results of the second run.
2232 for (size_t i = 0; i < next_batch_size(); i++) {
2233 for (size_t y = 0; y < next_output_height(); y++) {
2234 for (size_t x = 0; x < next_output_width(); x++) {
2235 for (size_t g = 0; g < groups(); g++) {
2236 for (size_t c = 0; c < group_output_channels(); c++) {
2237 ASSERT_GE(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min)
2238 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2239 ASSERT_LE(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max)
2240 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2241 ASSERT_NEAR(
2242 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
2243 output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c],
2244 1.0e-4 * std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]))
2245 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2246 }
2247 }
2248 }
2249 }
2250 }
2251 }
2252 }
2253
2254 private:
2255 uint32_t padding_top_{0};
2256 uint32_t padding_right_{0};
2257 uint32_t padding_bottom_{0};
2258 uint32_t padding_left_{0};
2259 bool padding_tf_same_{false};
2260 size_t input_height_{1};
2261 size_t input_width_{1};
2262 uint32_t groups_{1};
2263 size_t group_input_channels_{1};
2264 size_t input_channel_stride_{0};
2265 size_t group_output_channels_{1};
2266 size_t output_channel_stride_{0};
2267 size_t batch_size_{1};
2268 uint32_t kernel_height_{1};
2269 uint32_t kernel_width_{1};
2270 uint32_t dilation_height_{1};
2271 uint32_t dilation_width_{1};
2272 uint32_t subsampling_height_{1};
2273 uint32_t subsampling_width_{1};
2274 size_t next_input_height_{0};
2275 size_t next_input_width_{0};
2276 size_t next_batch_size_{0};
2277 float sparsity_{0.0f};
2278 uint8_t qmin_{0};
2279 uint8_t qmax_{255};
2280 bool depthwise_layout_{false};
2281 bool force_nhwc_input_{false};
2282 bool has_bias_{true};
2283 size_t iterations_{1};
2284};
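// A minimal usage sketch (hypothetical, for illustration only). The builder-style
// setters assumed here (input_size, padding, kernel_size, subsampling, groups,
// group_input_channels, group_output_channels, iterations) mirror the accessors
// used above, and TestNHWCxF32() is assumed to be the single-shape counterpart of
// TestSetupNHWCxF32(); the actual GoogleTest cases live in the convolution test
// sources rather than in this header.
//
//   TEST(CONVOLUTION_NHWC_F32, sketch_grouped_3x3s2_with_padding) {
//     ConvolutionOperatorTester()
//         .input_size(13, 14)
//         .padding(1)
//         .kernel_size(3, 3)
//         .subsampling(2)
//         .groups(2)
//         .group_input_channels(15)
//         .group_output_channels(17)
//         .iterations(3)
//         .TestNHWCxF32();
//   }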