// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <functional>
#include <limits>
#include <random>
#include <vector>

#include <fp16.h>

#include <xnnpack.h>


class ConvolutionOperatorTester {
 public:
  inline ConvolutionOperatorTester& padding_tf_same(bool padding_same) {
    if (padding_same) {
      assert(padding_top() == 0);
      assert(padding_left() == 0);
      assert(padding_bottom() == 0);
      assert(padding_right() == 0);
    }
    this->padding_tf_same_ = padding_same;
    return *this;
  }

  inline bool padding_tf_same() const {
    return this->padding_tf_same_;
  }

  inline ConvolutionOperatorTester& padding(uint32_t padding) {
    assert(!padding_tf_same());
    this->padding_top_ = padding;
    this->padding_right_ = padding;
    this->padding_bottom_ = padding;
    this->padding_left_ = padding;
    return *this;
  }

  inline ConvolutionOperatorTester& padding(uint32_t padding_height, uint32_t padding_width) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_height;
    this->padding_right_ = padding_width;
    this->padding_bottom_ = padding_height;
    this->padding_left_ = padding_width;
    return *this;
  }

  inline ConvolutionOperatorTester& padding_height(uint32_t padding_height) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_height;
    this->padding_bottom_ = padding_height;
    return *this;
  }

  inline ConvolutionOperatorTester& padding_width(uint32_t padding_width) {
    assert(!padding_tf_same());
    this->padding_right_ = padding_width;
    this->padding_left_ = padding_width;
    return *this;
  }

  inline ConvolutionOperatorTester& padding_top(uint32_t padding_top) {
    assert(!padding_tf_same());
    this->padding_top_ = padding_top;
    return *this;
  }

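  // With TF SAME padding, the implied padding is derived from the input/output sizes:
  // the total padding along each axis is split between the two sides, with the odd
  // pixel (if any) going to the bottom/right side, matching TensorFlow's convention.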
  inline uint32_t padding_top() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_height =
        (output_height() - 1) * subsampling_height() + dilated_kernel_height() - input_height();
      return total_padding_height / 2;
    } else {
      return this->padding_top_;
    }
  }

  inline ConvolutionOperatorTester& padding_left(uint32_t padding_left) {
    assert(!padding_tf_same());
    this->padding_left_ = padding_left;
    return *this;
  }

  inline uint32_t padding_left() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_width =
        (output_width() - 1) * subsampling_width() + dilated_kernel_width() - input_width();
      return total_padding_width / 2;
    } else {
      return this->padding_left_;
    }
  }

  inline ConvolutionOperatorTester& padding_bottom(uint32_t padding_bottom) {
    assert(!padding_tf_same());
    this->padding_bottom_ = padding_bottom;
    return *this;
  }

  inline uint32_t padding_bottom() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_height =
        (output_height() - 1) * subsampling_height() + dilated_kernel_height() - input_height();
      return total_padding_height - total_padding_height / 2;
    } else {
      return this->padding_bottom_;
    }
  }

  inline ConvolutionOperatorTester& padding_right(uint32_t padding_right) {
    assert(!padding_tf_same());
    this->padding_right_ = padding_right;
    return *this;
  }

  inline uint32_t padding_right() const {
    if (padding_tf_same()) {
      const uint32_t total_padding_width =
        (output_width() - 1) * subsampling_width() + dilated_kernel_width() - input_width();
      return total_padding_width - total_padding_width / 2;
    } else {
      return this->padding_right_;
    }
  }

  inline ConvolutionOperatorTester& input_size(uint32_t input_height, uint32_t input_width) {
    assert(input_height >= 1);
    assert(input_width >= 1);
    this->input_height_ = input_height;
    this->input_width_ = input_width;
    return *this;
  }

  inline ConvolutionOperatorTester& input_height(uint32_t input_height) {
    assert(input_height >= 1);
    this->input_height_ = input_height;
    return *this;
  }

  inline uint32_t input_height() const {
    return this->input_height_;
  }

  inline ConvolutionOperatorTester& input_width(uint32_t input_width) {
    assert(input_width >= 1);
    this->input_width_ = input_width;
    return *this;
  }

  inline uint32_t input_width() const {
    return this->input_width_;
  }

  inline ConvolutionOperatorTester& groups(uint32_t groups) {
    assert(groups >= 1);
    this->groups_ = groups;
    return *this;
  }

  inline uint32_t groups() const {
    return this->groups_;
  }

  inline ConvolutionOperatorTester& group_input_channels(size_t group_input_channels) {
    assert(group_input_channels >= 1);
    this->group_input_channels_ = group_input_channels;
    return *this;
  }

  inline size_t group_input_channels() const {
    return this->group_input_channels_;
  }

  inline ConvolutionOperatorTester& group_output_channels(size_t group_output_channels) {
    assert(group_output_channels >= 1);
    this->group_output_channels_ = group_output_channels;
    return *this;
  }

  inline size_t group_output_channels() const {
    return this->group_output_channels_;
  }

  inline ConvolutionOperatorTester& batch_size(size_t batch_size) {
    assert(batch_size >= 1);
    this->batch_size_ = batch_size;
    return *this;
  }

  inline size_t batch_size() const {
    return this->batch_size_;
  }

  inline ConvolutionOperatorTester& kernel_size(uint32_t kernel_size) {
    assert(kernel_size >= 1);
    this->kernel_height_ = kernel_size;
    this->kernel_width_ = kernel_size;
    return *this;
  }

  inline ConvolutionOperatorTester& kernel_size(uint32_t kernel_height, uint32_t kernel_width) {
    assert(kernel_height >= 1);
    assert(kernel_width >= 1);
    this->kernel_height_ = kernel_height;
    this->kernel_width_ = kernel_width;
    return *this;
  }

  inline ConvolutionOperatorTester& kernel_height(uint32_t kernel_height) {
    assert(kernel_height >= 1);
    this->kernel_height_ = kernel_height;
    return *this;
  }

  inline uint32_t kernel_height() const {
    return this->kernel_height_;
  }

  inline ConvolutionOperatorTester& kernel_width(uint32_t kernel_width) {
    assert(kernel_width >= 1);
    this->kernel_width_ = kernel_width;
    return *this;
  }

  inline uint32_t kernel_width() const {
    return this->kernel_width_;
  }

  inline ConvolutionOperatorTester& dilation(uint32_t dilation) {
    assert(dilation >= 1);
    this->dilation_height_ = dilation;
    this->dilation_width_ = dilation;
    return *this;
  }

  inline ConvolutionOperatorTester& dilation(uint32_t dilation_height, uint32_t dilation_width) {
    assert(dilation_height >= 1);
    assert(dilation_width >= 1);
    this->dilation_height_ = dilation_height;
    this->dilation_width_ = dilation_width;
    return *this;
  }

  inline ConvolutionOperatorTester& dilation_height(uint32_t dilation_height) {
    assert(dilation_height >= 1);
    this->dilation_height_ = dilation_height;
    return *this;
  }

  inline uint32_t dilation_height() const {
    return this->dilation_height_;
  }

  inline ConvolutionOperatorTester& dilation_width(uint32_t dilation_width) {
    assert(dilation_width >= 1);
    this->dilation_width_ = dilation_width;
    return *this;
  }

  inline uint32_t dilation_width() const {
    return this->dilation_width_;
  }

  inline ConvolutionOperatorTester& subsampling(uint32_t subsampling) {
    assert(subsampling >= 1);
    this->subsampling_height_ = subsampling;
    this->subsampling_width_ = subsampling;
    return *this;
  }

  inline ConvolutionOperatorTester& subsampling(uint32_t subsampling_height, uint32_t subsampling_width) {
    assert(subsampling_height >= 1);
    assert(subsampling_width >= 1);
    this->subsampling_height_ = subsampling_height;
    this->subsampling_width_ = subsampling_width;
    return *this;
  }

  inline ConvolutionOperatorTester& subsampling_height(uint32_t subsampling_height) {
    assert(subsampling_height >= 1);
    this->subsampling_height_ = subsampling_height;
    return *this;
  }

  inline uint32_t subsampling_height() const {
    return this->subsampling_height_;
  }

  inline ConvolutionOperatorTester& subsampling_width(uint32_t subsampling_width) {
    assert(subsampling_width >= 1);
    this->subsampling_width_ = subsampling_width;
    return *this;
  }

  inline uint32_t subsampling_width() const {
    return this->subsampling_width_;
  }

  inline ConvolutionOperatorTester& input_channel_stride(size_t input_channel_stride) {
    assert(input_channel_stride >= 1);
    this->input_channel_stride_ = input_channel_stride;
    return *this;
  }

  inline size_t input_channel_stride() const {
    if (this->input_channel_stride_ == 0) {
      return group_input_channels() * groups();
    } else {
      assert(this->input_channel_stride_ >= group_input_channels() * groups());
      return this->input_channel_stride_;
    }
  }

  inline ConvolutionOperatorTester& output_channel_stride(size_t output_channel_stride) {
    assert(output_channel_stride >= 1);
    this->output_channel_stride_ = output_channel_stride;
    return *this;
  }

  inline size_t output_channel_stride() const {
    if (this->output_channel_stride_ == 0) {
      return group_output_channels() * groups();
    } else {
      assert(this->output_channel_stride_ >= group_output_channels() * groups());
      return this->output_channel_stride_;
    }
  }

  inline uint32_t dilated_kernel_height() const {
    return (kernel_height() - 1) * dilation_height() + 1;
  }

  inline uint32_t dilated_kernel_width() const {
    return (kernel_width() - 1) * dilation_width() + 1;
  }

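  // Output size follows the usual convolution arithmetic:
  //   output = (padded input - dilated kernel) / subsampling + 1, and at least 1;
  // with TF SAME padding it is simply ceil(input / subsampling).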
  inline size_t output_height() const {
    if (padding_tf_same()) {
      return (input_height() + subsampling_height() - 1) / subsampling_height();
    } else {
      const size_t padded_input_height = padding_top() + input_height() + padding_bottom();
      if (padded_input_height <= dilated_kernel_height()) {
        return 1;
      } else {
        return (padded_input_height - dilated_kernel_height()) / subsampling_height() + 1;
      }
    }
  }

  inline size_t output_width() const {
    if (padding_tf_same()) {
      return (input_width() + subsampling_width() - 1) / subsampling_width();
    } else {
      const size_t padded_input_width = padding_left() + input_width() + padding_right();
      if (padded_input_width <= dilated_kernel_width()) {
        return 1;
      } else {
        return (padded_input_width - dilated_kernel_width()) / subsampling_width() + 1;
      }
    }
  }

  inline ConvolutionOperatorTester& next_input_size(uint32_t next_input_height, uint32_t next_input_width) {
    assert(next_input_height >= 1);
    assert(next_input_width >= 1);
    this->next_input_height_ = next_input_height;
    this->next_input_width_ = next_input_width;
    return *this;
  }

  inline ConvolutionOperatorTester& next_input_height(uint32_t next_input_height) {
    assert(next_input_height >= 1);
    this->next_input_height_ = next_input_height;
    return *this;
  }

  inline uint32_t next_input_height() const {
    if (this->next_input_height_ == 0) {
      return input_height();
    } else {
      return this->next_input_height_;
    }
  }

  inline ConvolutionOperatorTester& next_input_width(uint32_t next_input_width) {
    assert(next_input_width >= 1);
    this->next_input_width_ = next_input_width;
    return *this;
  }

  inline uint32_t next_input_width() const {
    if (this->next_input_width_ == 0) {
      return input_width();
    } else {
      return this->next_input_width_;
    }
  }

  inline size_t next_output_height() const {
    const size_t padded_input_height = padding_top() + next_input_height() + padding_bottom();
    if (padded_input_height <= dilated_kernel_height()) {
      return 1;
    } else {
      return (padded_input_height - dilated_kernel_height()) / subsampling_height() + 1;
    }
  }

  inline size_t next_output_width() const {
    const size_t padded_input_width = padding_left() + next_input_width() + padding_right();
    if (padded_input_width <= dilated_kernel_width()) {
      return 1;
    } else {
      return (padded_input_width - dilated_kernel_width()) / subsampling_width() + 1;
    }
  }

  inline ConvolutionOperatorTester& next_batch_size(size_t next_batch_size) {
    assert(next_batch_size >= 1);
    this->next_batch_size_ = next_batch_size;
    return *this;
  }

  inline size_t next_batch_size() const {
    if (this->next_batch_size_ == 0) {
      return batch_size();
    } else {
      return this->next_batch_size_;
    }
  }

  inline ConvolutionOperatorTester& sparsity(float sparsity) {
    this->sparsity_ = sparsity;
    return *this;
  }

  inline float sparsity() const {
    return this->sparsity_;
  }

  inline ConvolutionOperatorTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline ConvolutionOperatorTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  inline ConvolutionOperatorTester& force_nhwc_input(bool force_nhwc_input) {
    this->force_nhwc_input_ = force_nhwc_input;
    return *this;
  }

  inline bool force_nhwc_input() const {
    return this->force_nhwc_input_;
  }

  inline ConvolutionOperatorTester& depthwise_layout(bool depthwise_layout) {
    this->depthwise_layout_ = depthwise_layout;
    return *this;
  }

  inline bool depthwise_layout() const {
    return this->depthwise_layout_;
  }

  inline ConvolutionOperatorTester& has_bias(bool has_bias) {
    this->has_bias_ = has_bias;
    return *this;
  }

  inline bool has_bias() const {
    return this->has_bias_;
  }

  inline ConvolutionOperatorTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

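  // Typical usage (illustrative sketch: the test suite and case names below are
  // hypothetical, only the tester methods are real):
  //
  //   TEST(CONVOLUTION_NHWC_F32, example_grouped_3x3) {
  //     ConvolutionOperatorTester()
  //       .input_size(13, 14)
  //       .kernel_size(3, 3)
  //       .padding(1)
  //       .groups(2)
  //       .group_input_channels(15)
  //       .group_output_channels(17)
  //       .iterations(3)
  //       .TestNHWCxF32();
  //   }
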
  void TestNHWCxQS8() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
    auto i8rng = std::bind(
      std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), rng);

    std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) +
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()) + 8);
    std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<int32_t> bias(groups() * group_output_channels());
    std::vector<int8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
    std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    const int8_t input_zero_point = -1;

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(i8rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(i8rng));
      std::generate(bias.begin(), bias.end(), std::ref(i32rng));
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results, without renormalization.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(accumulators.begin(), accumulators.end(), 0);
      }
      if (depthwise_layout()) {
        ASSERT_EQ(group_input_channels(), 1);

        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g]) - int32_t(input_zero_point)) *
                            int32_t(kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]);
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      } else {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          for (size_t ic = 0; ic < group_input_channels(); ic++) {
                            accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                              (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
                              int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());

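      // The output scale maps the observed accumulator range onto the 256 representable
      // quantized values, and the zero point is chosen to center that range around zero.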
      const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
      const int8_t output_zero_point = int8_t(std::max(std::min(
        lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));

      // Renormalize reference results.
      std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
        [this, output_scale, output_zero_point](int32_t x) -> double {
          return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
        });

      // Create, setup, run, and destroy Convolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;

      xnn_status status = xnn_create_convolution2d_nhwc_qs8(
          padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
          padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
          kernel_height(), kernel_width(),
          subsampling_height(), subsampling_width(),
          dilation_height(), dilation_width(),
          groups(), group_input_channels(), group_output_channels(),
          input_channel_stride(), output_channel_stride(),
          input_zero_point, 1.0f /* input scale */, 1.0f /* kernel scale */,
          kernel.data(), has_bias() ? bias.data() : nullptr,
          output_zero_point, output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
          (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
          &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_qs8(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                    output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                    double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
                    0.9)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

  void TestNHWCxQU8() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
    auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);

    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()) + 8);
    std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<int32_t> bias(groups() * group_output_channels());
    std::vector<uint8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
    std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    const uint8_t input_zero_point = 127;
    const uint8_t kernel_zero_point = 127;

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(u8rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
      std::generate(bias.begin(), bias.end(), std::ref(i32rng));
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results, without renormalization.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(accumulators.begin(), accumulators.end(), 0);
      }
      if (depthwise_layout()) {
        ASSERT_EQ(group_input_channels(), 1);

        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g]) - int32_t(input_zero_point)) *
                            (int32_t(kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]) - int32_t(kernel_zero_point));
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      } else {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          for (size_t ic = 0; ic < group_input_channels(); ic++) {
                            accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                              (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
                              (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());

      const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
      const uint8_t output_zero_point = uint8_t(std::max(std::min(
        lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));

      // Renormalize reference results.
      std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
        [this, output_scale, output_zero_point](int32_t x) -> double {
          return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
        });

      // Create, setup, run, and destroy Convolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;

      xnn_status status = xnn_create_convolution2d_nhwc_qu8(
          padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
          padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
          kernel_height(), kernel_width(),
          subsampling_height(), subsampling_width(),
          dilation_height(), dilation_width(),
          groups(), group_input_channels(), group_output_channels(),
          input_channel_stride(), output_channel_stride(),
          input_zero_point, 1.0f /* input scale */,
          kernel_zero_point, 1.0f /* kernel scale */,
          kernel.data(), has_bias() ? bias.data() : nullptr,
          output_zero_point, output_scale, qmin(), qmax(),
          (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
          &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_qu8(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                    output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                    double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
                    0.9)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

  void TestNHWCxF32() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);

    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
    std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<float> bias(groups() * group_output_channels());
    std::vector<float> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
      std::generate(bias.begin(), bias.end(), std::ref(f32rng));
      std::fill(output.begin(), output.end(), nanf(""));

      // Compute reference results, without clamping.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    bias[g * group_output_channels() + oc];
                }
              }
            }
          }
        }
      } else {
        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
      }
      if (depthwise_layout()) {
        ASSERT_EQ(group_input_channels(), 1);

        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g] *
                            kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc];
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      } else {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          for (size_t ic = 0; ic < group_input_channels(); ic++) {
                            output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                              input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] *
                              kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());

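      // qmin()/qmax() are given on a [0, 255] scale; they are mapped onto the observed
      // accumulator range to derive the float clamping bounds passed to the operator.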
      const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
      const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, run, and destroy Convolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;

      xnn_status status = xnn_create_convolution2d_nhwc_f32(
          padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
          padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
          kernel_height(), kernel_width(),
          subsampling_height(), subsampling_width(),
          dilation_height(), dilation_width(),
          groups(), group_input_channels(), group_output_channels(),
          input_channel_stride(), output_channel_stride(),
          kernel.data(), has_bias() ? bias.data() : nullptr,
          output_min, output_max,
          (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
          &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_f32(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                    output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                    output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c],
                    1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

  void TestNHWCxF16() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
    auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);

    std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) +
      batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()));
    std::vector<uint16_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<uint16_t> bias(groups() * group_output_channels());
    std::vector<uint16_t> output(batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f16rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
      std::generate(bias.begin(), bias.end(), std::ref(f16rng));
      std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);

      // Compute reference results, without clamping.
      if (has_bias()) {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t g = 0; g < groups(); g++) {
                for (size_t oc = 0; oc < group_output_channels(); oc++) {
                  output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                    fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
                }
              }
            }
          }
        }
      } else {
        std::fill(output_ref.begin(), output_ref.end(), 0.0f);
      }
      if (depthwise_layout()) {
        ASSERT_EQ(group_input_channels(), 1);

        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g]) *
                            fp16_ieee_to_fp32_value(kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]);
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      } else {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          for (size_t ic = 0; ic < group_input_channels(); ic++) {
                            output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                              fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) *
                              fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_range = accumulated_max - accumulated_min;
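      // The clamping bounds are rounded through fp16, matching the precision the operator
      // computes in; if rounding collapses both bounds to the same value, clamping is
      // effectively disabled (set to +/-infinity) so the test does not reject every output.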
      const float scaled_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
      const float scaled_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));
      const float output_min = scaled_min == scaled_max ? -std::numeric_limits<float>::infinity() : scaled_min;
      const float output_max = scaled_min == scaled_max ? +std::numeric_limits<float>::infinity() : scaled_max;

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, run, and destroy Convolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
      xnn_operator_t convolution_op = nullptr;

      xnn_status status = xnn_create_convolution2d_nhwc_f16(
          padding_tf_same() ? 0 : padding_top(), padding_tf_same() ? 0 : padding_right(),
          padding_tf_same() ? 0 : padding_bottom(), padding_tf_same() ? 0 : padding_left(),
          kernel_height(), kernel_width(),
          subsampling_height(), subsampling_width(),
          dilation_height(), dilation_width(),
          groups(), group_input_channels(), group_output_channels(),
          input_channel_stride(), output_channel_stride(),
          kernel.data(), has_bias() ? bias.data() : nullptr,
          output_min, output_max,
          (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (padding_tf_same() ? XNN_FLAG_TENSORFLOW_SAME_PADDING : 0),
          &convolution_op);
      if (status == xnn_status_unsupported_hardware) {
        GTEST_SKIP();
      }
      ASSERT_EQ(xnn_status_success, status);
      ASSERT_NE(nullptr, convolution_op);

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_f16(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
//                ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min)
//                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
//                ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max)
//                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                    output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                    fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]),
                    std::max(1.0e-4f, std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]) * 1.0e-2f))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

  void TestNCHWxF32() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);
    auto prng = std::bind(std::uniform_real_distribution<float>(), rng);

    std::vector<float> input(2 * XNN_EXTRA_BYTES / sizeof(float) +
      ((batch_size() - 1) * input_channel_stride() + groups() * group_input_channels()) * input_height() * input_width());
    std::vector<float> kernel(
      groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<float> bias(groups() * group_output_channels());
    std::vector<float> output(
      ((batch_size() - 1) * output_channel_stride() + groups() * group_output_channels()) * output_height() * output_width());
    std::vector<float> output_ref(batch_size() * groups() * group_output_channels() * output_height() * output_width());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
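      // Zero out a random fraction of the kernel weights so that sparse kernels are
      // exercised at (approximately) the requested sparsity level.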
1182 for (float& k : kernel) {
1183 if (prng() <= sparsity()) {
1184 k = 0.0f;
1185 }
1186 }
1187 std::generate(bias.begin(), bias.end(), std::ref(f32rng));
1188 std::fill(output.begin(), output.end(), nanf(""));
1189
1190 // Compute reference results, without clamping.
1191 if (has_bias()) {
1192 for (size_t i = 0; i < batch_size(); i++) {
1193 for (size_t oy = 0; oy < output_height(); oy++) {
1194 for (size_t ox = 0; ox < output_width(); ox++) {
1195 for (size_t g = 0; g < groups(); g++) {
1196 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1197 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] =
1198 bias[g * group_output_channels() + oc];
1199 }
1200 }
1201 }
1202 }
1203 }
1204 } else {
1205 std::fill(output_ref.begin(), output_ref.end(), 0.0f);
1206 }
1207 if (force_nhwc_input()) {
1208 for (size_t i = 0; i < batch_size(); i++) {
1209 for (size_t oy = 0; oy < output_height(); oy++) {
1210 for (size_t ox = 0; ox < output_width(); ox++) {
1211 for (size_t ky = 0; ky < kernel_height(); ky++) {
1212 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1213 if (iy < input_height()) {
1214 for (size_t kx = 0; kx < kernel_width(); kx++) {
1215 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1216 if (ix < input_width()) {
1217 for (size_t g = 0; g < groups(); g++) {
1218 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1219 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1220 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] +=
1221 input[((((i * input_height() + iy) * input_width() + ix) * groups() + g) * group_input_channels() + ic)] *
1222 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
1223 }
1224 }
1225 }
1226 }
1227 }
1228 }
1229 }
1230 }
1231 }
1232 }
Marat Dukhan33032712020-06-18 11:06:04 -07001233 } else if (depthwise_layout()) {
1234 ASSERT_EQ(group_input_channels(), 1);
1235
1236 for (size_t i = 0; i < batch_size(); i++) {
1237 for (size_t oy = 0; oy < output_height(); oy++) {
1238 for (size_t ox = 0; ox < output_width(); ox++) {
1239 for (size_t ky = 0; ky < kernel_height(); ky++) {
1240 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1241 if (iy < input_height()) {
1242 for (size_t kx = 0; kx < kernel_width(); kx++) {
1243 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1244 if (ix < input_width()) {
1245 for (size_t g = 0; g < groups(); g++) {
1246 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1247 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] +=
1248 input[((i * input_channel_stride() + g) * input_height() + iy) * input_width() + ix] *
1249 kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc];
1250 }
1251 }
1252 }
1253 }
1254 }
1255 }
1256 }
1257 }
1258 }
Marat Dukhanefc47b82019-11-18 09:25:38 -08001259 } else {
1260 for (size_t i = 0; i < batch_size(); i++) {
1261 for (size_t oy = 0; oy < output_height(); oy++) {
1262 for (size_t ox = 0; ox < output_width(); ox++) {
1263 for (size_t ky = 0; ky < kernel_height(); ky++) {
1264 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1265 if (iy < input_height()) {
1266 for (size_t kx = 0; kx < kernel_width(); kx++) {
1267 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1268 if (ix < input_width()) {
1269 for (size_t g = 0; g < groups(); g++) {
1270 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1271 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1272 output_ref[(((i * groups() + g) * group_output_channels() + oc) * output_height() + oy) * output_width() + ox] +=
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001273 input[((i * input_channel_stride() + g * group_input_channels() + ic) * input_height() + iy) * input_width() + ix] *
Marat Dukhanefc47b82019-11-18 09:25:38 -08001274 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
1275 }
1276 }
1277 }
1278 }
1279 }
1280 }
1281 }
1282 }
1283 }
1284 }
1285 }
1286
1287 // Compute clamping parameters.
1288 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1289 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1290
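      // qmin()/qmax() select the clamping thresholds as fractions of the accumulated range:
      // e.g. qmin() == 64 places output_min roughly a quarter of the way up the range
      // (64/255 ~ 0.25), while qmin() == 0 / qmax() == 255 disable the respective bound entirely.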
Marat Dukhan869c62d2020-04-09 17:17:55 -07001291 const float output_min = qmin() == 0 ? -std::numeric_limits<float>::infinity() :
1292 accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
1293 const float output_max = qmax() == 255 ? std::numeric_limits<float>::infinity() :
1294 accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
Marat Dukhanefc47b82019-11-18 09:25:38 -08001295
1296 // Clamp reference results.
1297 for (float& value : output_ref) {
1298 value = std::max(std::min(value, output_max), output_min);
1299 }
1300
1301 // Create, setup, run, and destroy Convolution operator.
Marat Dukhan04f03be2019-11-19 12:36:47 -08001302 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
Marat Dukhanefc47b82019-11-18 09:25:38 -08001303 xnn_operator_t convolution_op = nullptr;
1304
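      // Create the NCHW convolution. XNN_FLAG_DEPTHWISE_CONVOLUTION and XNN_FLAG_INPUT_NHWC
      // request the depthwise kernel layout and the NHWC input layout exercised by the reference
      // code above; parameter combinations the operator does not support are skipped below.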
1305 xnn_status status = xnn_create_convolution2d_nchw_f32(
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001306 padding_top(), padding_right(), padding_bottom(), padding_left(),
1307 kernel_height(), kernel_width(),
1308 subsampling_height(), subsampling_width(),
1309 dilation_height(), dilation_width(),
1310 groups(), group_input_channels(), group_output_channels(),
1311 input_channel_stride(), output_channel_stride(),
1312 kernel.data(), has_bias() ? bias.data() : nullptr,
1313 output_min, output_max,
1314 (depthwise_layout() ? XNN_FLAG_DEPTHWISE_CONVOLUTION : 0) | (force_nhwc_input() ? XNN_FLAG_INPUT_NHWC : 0),
1315 &convolution_op);
Marat Dukhanefc47b82019-11-18 09:25:38 -08001316 if (status == xnn_status_unsupported_parameter) {
1317 GTEST_SKIP();
1318 }
1319 ASSERT_EQ(xnn_status_success, status);
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001320 ASSERT_NE(nullptr, convolution_op);
Marat Dukhanefc47b82019-11-18 09:25:38 -08001321
1322 // Smart pointer to automatically delete convolution_op.
1323 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1324
1325 ASSERT_EQ(xnn_status_success,
1326 xnn_setup_convolution2d_nchw_f32(
1327 convolution_op,
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001328 batch_size(), input_height(), input_width(),
Marat Dukhanefc47b82019-11-18 09:25:38 -08001329 input.data(), output.data(),
1330 nullptr /* thread pool */));
1331
1332 ASSERT_EQ(xnn_status_success,
1333 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1334
1335 // Verify results.
1336 for (size_t i = 0; i < batch_size(); i++) {
1337 for (size_t y = 0; y < output_height(); y++) {
1338 for (size_t x = 0; x < output_width(); x++) {
1339 for (size_t g = 0; g < groups(); g++) {
1340 for (size_t c = 0; c < group_output_channels(); c++) {
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001341 ASSERT_GE(output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x], output_min)
Marat Dukhanefc47b82019-11-18 09:25:38 -08001342 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i;
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001343 ASSERT_LE(output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x], output_max)
Marat Dukhanefc47b82019-11-18 09:25:38 -08001344 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i;
1345 ASSERT_NEAR(
1346 output_ref[(((i * groups() + g) * group_output_channels() + c) * output_height() + y) * output_width() + x],
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001347 output[((i * output_channel_stride() + g * group_output_channels() + c) * output_height() + y) * output_width() + x],
Marat Dukhanefc47b82019-11-18 09:25:38 -08001348 1.0e-4 * std::abs(output_ref[(((i * groups() + g) * group_output_channels() + c) * output_height() + y) * output_width() + x]))
1349 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c << ", image = " << i;
1350 }
1351 }
1352 }
1353 }
1354 }
1355 }
1356 }
1357
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001358 void TestSetupNHWCxQS8() const {
1359 ASSERT_FALSE(depthwise_layout());
1360
1361 std::random_device random_device;
1362 auto rng = std::mt19937(random_device());
1363 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
1364 auto i8rng = std::bind(
1365 std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), rng);
1366
1367 std::vector<int8_t> input(XNN_EXTRA_BYTES / sizeof(int8_t) + std::max(
1368 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
1369 next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())) + 8);
1370 std::vector<int8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1371 std::vector<int32_t> bias(groups() * group_output_channels());
1372 std::vector<int8_t> output(std::max(
1373 batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
1374 next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
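    // The input and output buffers are sized for the larger of the two runs, since the same
    // buffers are reused when the operator is set up again with the 'next' shape.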
1375 std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1376 std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1377 std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1378 std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1379
1380 const int8_t input_zero_point = -1;
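    // A non-zero input zero point keeps the test sensitive to zero-point handling in the
    // quantized kernels (an assumption about intent; the particular value is arbitrary).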
1381
1382 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1383 std::generate(input.begin(), input.end(), std::ref(i8rng));
1384 std::generate(kernel.begin(), kernel.end(), std::ref(i8rng));
1385 std::generate(bias.begin(), bias.end(), std::ref(i32rng));
1386 std::fill(output.begin(), output.end(), 0xA5);
1387
1388 // Compute reference results, without renormalization.
1389 if (has_bias()) {
1390 for (size_t i = 0; i < batch_size(); i++) {
1391 for (size_t oy = 0; oy < output_height(); oy++) {
1392 for (size_t ox = 0; ox < output_width(); ox++) {
1393 for (size_t g = 0; g < groups(); g++) {
1394 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1395 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1396 bias[g * group_output_channels() + oc];
1397 }
1398 }
1399 }
1400 }
1401 }
1402 } else {
1403 std::fill(accumulators.begin(), accumulators.end(), 0);
1404 }
1405 for (size_t i = 0; i < batch_size(); i++) {
1406 for (size_t oy = 0; oy < output_height(); oy++) {
1407 for (size_t ox = 0; ox < output_width(); ox++) {
1408 for (size_t ky = 0; ky < kernel_height(); ky++) {
1409 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1410 if (iy < input_height()) {
1411 for (size_t kx = 0; kx < kernel_width(); kx++) {
1412 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1413 if (ix < input_width()) {
1414 for (size_t g = 0; g < groups(); g++) {
1415 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1416 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1417 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1418 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1419 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1420 }
1421 }
1422 }
1423 }
1424 }
1425 }
1426 }
1427 }
1428 }
1429 }
1430
1431 // Compute renormalization parameters.
1432 const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
1433 const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
1434
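      // Derive a quantization scale that maps the full accumulator range onto the 255 steps of
      // int8, and a zero point that roughly centers that range. For example (illustrative numbers,
      // not taken from the test), accumulators spanning [-1000, 1040] give output_scale = 8.0 and
      // output_zero_point = lrint(-0.5 - 0.5 * 40 / 8.0) = -3.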
1435 const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
1436 const int8_t output_zero_point = int8_t(std::max(std::min(
1437 lrint(-0.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
1438 long(std::numeric_limits<int8_t>::max())), long(std::numeric_limits<int8_t>::min())));
1439
1440 // Renormalize reference results.
1441 std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
1442 [this, output_scale, output_zero_point](int32_t x) -> double {
1443 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
1444 });
1445
1446 // Create, setup, and run Convolution operator once.
1447 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1448 xnn_operator_t convolution_op = nullptr;
1449
1450 xnn_status status = xnn_create_convolution2d_nhwc_qs8(
1451 padding_top(), padding_right(), padding_bottom(), padding_left(),
1452 kernel_height(), kernel_width(),
1453 subsampling_height(), subsampling_width(),
1454 dilation_height(), dilation_width(),
1455 groups(), group_input_channels(), group_output_channels(),
1456 input_channel_stride(), output_channel_stride(),
1457 input_zero_point, 1.0f /* input scale */, 1.0f /* kernel scale */,
1458 kernel.data(), has_bias() ? bias.data() : nullptr,
1459 output_zero_point, output_scale, int8_t(qmin() - 0x80), int8_t(qmax() - 0x80),
1460 0, &convolution_op);
1461 if (status == xnn_status_unsupported_hardware) {
1462 GTEST_SKIP();
1463 }
1464 ASSERT_EQ(xnn_status_success, status);
1465 ASSERT_NE(nullptr, convolution_op);
1466
1467 // Smart pointer to automatically delete convolution_op.
1468 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1469
1470 ASSERT_EQ(xnn_status_success,
1471 xnn_setup_convolution2d_nhwc_qs8(
1472 convolution_op,
1473 batch_size(), input_height(), input_width(),
1474 input.data(), output.data(),
1475 nullptr /* thread pool */));
1476
1477 ASSERT_EQ(xnn_status_success,
1478 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1479
1480 // Verify results of the first run.
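      // The 0.9 tolerance is expressed in output quantization steps: the reference value is kept
      // in double precision, so the integer output is allowed to differ from it by just under one step.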
1481 for (size_t i = 0; i < batch_size(); i++) {
1482 for (size_t y = 0; y < output_height(); y++) {
1483 for (size_t x = 0; x < output_width(); x++) {
1484 for (size_t g = 0; g < groups(); g++) {
1485 for (size_t c = 0; c < group_output_channels(); c++) {
1486 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
1487 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1488 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
1489 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1490 ASSERT_NEAR(
1491 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
1492 double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1493 0.9)
1494 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1495 }
1496 }
1497 }
1498 }
1499 }
1500
1501      // Re-generate the input for the second run (the kernel and bias are already packed inside the operator).
1502 std::generate(input.begin(), input.end(), std::ref(i8rng));
1503 std::fill(output.begin(), output.end(), 0xA5);
1504
1505 // Compute reference results for the second run, including renormalization.
1506 if (has_bias()) {
1507 for (size_t i = 0; i < next_batch_size(); i++) {
1508 for (size_t oy = 0; oy < next_output_height(); oy++) {
1509 for (size_t ox = 0; ox < next_output_width(); ox++) {
1510 for (size_t g = 0; g < groups(); g++) {
1511 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1512 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1513 bias[g * group_output_channels() + oc];
1514 }
1515 }
1516 }
1517 }
1518 }
1519 } else {
1520 std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
1521 }
1522 for (size_t i = 0; i < next_batch_size(); i++) {
1523 for (size_t oy = 0; oy < next_output_height(); oy++) {
1524 for (size_t ox = 0; ox < next_output_width(); ox++) {
1525 for (size_t ky = 0; ky < kernel_height(); ky++) {
1526 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1527 if (iy < next_input_height()) {
1528 for (size_t kx = 0; kx < kernel_width(); kx++) {
1529 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1530 if (ix < next_input_width()) {
1531 for (size_t g = 0; g < groups(); g++) {
1532 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1533 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1534 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1535 (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
1536 int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1537 }
1538 }
1539 }
1540 }
1541 }
1542 }
1543 }
1544 }
1545 }
1546 }
1547 std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(),
1548 [this, output_scale, output_zero_point](int32_t x) -> double {
1549 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax() - 0x80) - output_zero_point), double(qmin() - 0x80) - output_zero_point);
1550 });
1551
1552 // Setup and run Convolution operator the second time, and destroy the operator.
1553 ASSERT_EQ(xnn_status_success,
1554 xnn_setup_convolution2d_nhwc_qs8(
1555 convolution_op,
1556 next_batch_size(), next_input_height(), next_input_width(),
1557 input.data(), output.data(),
1558 nullptr /* thread pool */));
1559
1560 ASSERT_EQ(xnn_status_success,
1561 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1562
1563 // Verify results of the second run.
1564 for (size_t i = 0; i < next_batch_size(); i++) {
1565 for (size_t y = 0; y < next_output_height(); y++) {
1566 for (size_t x = 0; x < next_output_width(); x++) {
1567 for (size_t g = 0; g < groups(); g++) {
1568 for (size_t c = 0; c < group_output_channels(); c++) {
1569 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax() - 0x80))
1570 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1571 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin() - 0x80))
1572 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1573 ASSERT_NEAR(
1574 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
1575 double(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
1576 0.9)
1577 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1578 }
1579 }
1580 }
1581 }
1582 }
1583 }
1584 }
1585
Marat Dukhan08b7a972020-07-14 18:17:29 -07001586 void TestSetupNHWCxQU8() const {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001587 ASSERT_FALSE(depthwise_layout());
1588
1589 std::random_device random_device;
1590 auto rng = std::mt19937(random_device());
Marat Dukhanecd83112020-08-03 21:50:28 -07001591 auto i32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
Marat Dukhan5ce30d92020-04-14 03:31:26 -07001592 auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001593
1594 std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + std::max(
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001595 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
1596 next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())) + 8);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001597 std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1598 std::vector<int32_t> bias(groups() * group_output_channels());
1599 std::vector<uint8_t> output(std::max(
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001600 batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
1601 next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
XNNPACK Teamb455b122019-09-27 18:10:33 -07001602 std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1603 std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1604 std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1605 std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1606
1607 const uint8_t input_zero_point = 127;
1608 const uint8_t kernel_zero_point = 127;
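    // Zero points of 127 roughly center the uint8 range, so effective input and kernel values
    // span about [-127, 128].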
1609
1610 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1611 std::generate(input.begin(), input.end(), std::ref(u8rng));
1612 std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
Marat Dukhanecd83112020-08-03 21:50:28 -07001613 std::generate(bias.begin(), bias.end(), std::ref(i32rng));
XNNPACK Teamb455b122019-09-27 18:10:33 -07001614 std::fill(output.begin(), output.end(), 0xA5);
1615
1616 // Compute reference results, without renormalization.
Marat Dukhanf568f082019-10-30 09:47:07 -07001617 if (has_bias()) {
1618 for (size_t i = 0; i < batch_size(); i++) {
1619 for (size_t oy = 0; oy < output_height(); oy++) {
1620 for (size_t ox = 0; ox < output_width(); ox++) {
1621 for (size_t g = 0; g < groups(); g++) {
1622 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1623 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1624 bias[g * group_output_channels() + oc];
1625 }
XNNPACK Teamb455b122019-09-27 18:10:33 -07001626 }
1627 }
1628 }
1629 }
Marat Dukhanf568f082019-10-30 09:47:07 -07001630 } else {
1631 std::fill(accumulators.begin(), accumulators.end(), 0);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001632 }
1633 for (size_t i = 0; i < batch_size(); i++) {
1634 for (size_t oy = 0; oy < output_height(); oy++) {
1635 for (size_t ox = 0; ox < output_width(); ox++) {
1636 for (size_t ky = 0; ky < kernel_height(); ky++) {
1637 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1638 if (iy < input_height()) {
1639 for (size_t kx = 0; kx < kernel_width(); kx++) {
1640 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1641 if (ix < input_width()) {
1642 for (size_t g = 0; g < groups(); g++) {
1643 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1644 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1645 accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001646 (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
XNNPACK Teamb455b122019-09-27 18:10:33 -07001647 (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
1648 }
1649 }
1650 }
1651 }
1652 }
1653 }
1654 }
1655 }
1656 }
1657 }
1658
1659 // Compute renormalization parameters.
1660 const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
1661 const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
1662
1663 const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
1664 const uint8_t output_zero_point = uint8_t(std::max(std::min(
1665 lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
1666 long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));
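      // For the unsigned path the midpoint of [0, 255] is 127.5, so the zero point is chosen to
      // center the accumulated range within that interval and is then clamped to representable
      // uint8 values.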
1667
1668 // Renormalize reference results.
1669 std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
1670 [this, output_scale, output_zero_point](int32_t x) -> double {
1671 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
1672 });
1673
1674 // Create, setup, and run Convolution operator once.
Marat Dukhan04f03be2019-11-19 12:36:47 -08001675 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
XNNPACK Teamb455b122019-09-27 18:10:33 -07001676 xnn_operator_t convolution_op = nullptr;
1677
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001678 xnn_status status = xnn_create_convolution2d_nhwc_qu8(
XNNPACK Teamb455b122019-09-27 18:10:33 -07001679 padding_top(), padding_right(), padding_bottom(), padding_left(),
1680 kernel_height(), kernel_width(),
1681 subsampling_height(), subsampling_width(),
1682 dilation_height(), dilation_width(),
1683 groups(), group_input_channels(), group_output_channels(),
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001684 input_channel_stride(), output_channel_stride(),
XNNPACK Teamb455b122019-09-27 18:10:33 -07001685 input_zero_point, 1.0f /* input scale */,
1686 kernel_zero_point, 1.0f /* kernel scale */,
Marat Dukhanf568f082019-10-30 09:47:07 -07001687 kernel.data(), has_bias() ? bias.data() : nullptr,
XNNPACK Teamb455b122019-09-27 18:10:33 -07001688 output_zero_point, output_scale, qmin(), qmax(),
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07001689 0, &convolution_op);
1690 if (status == xnn_status_unsupported_hardware) {
1691 GTEST_SKIP();
1692 }
1693 ASSERT_EQ(xnn_status_success, status);
1694 ASSERT_NE(nullptr, convolution_op);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001695
1696 // Smart pointer to automatically delete convolution_op.
1697 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1698
1699 ASSERT_EQ(xnn_status_success,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001700 xnn_setup_convolution2d_nhwc_qu8(
XNNPACK Teamb455b122019-09-27 18:10:33 -07001701 convolution_op,
1702 batch_size(), input_height(), input_width(),
1703 input.data(), output.data(),
1704 nullptr /* thread pool */));
1705
1706 ASSERT_EQ(xnn_status_success,
1707 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1708
1709 // Verify results of the first run.
1710 for (size_t i = 0; i < batch_size(); i++) {
1711 for (size_t y = 0; y < output_height(); y++) {
1712 for (size_t x = 0; x < output_width(); x++) {
1713 for (size_t g = 0; g < groups(); g++) {
1714 for (size_t c = 0; c < group_output_channels(); c++) {
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001715 ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
XNNPACK Teamb455b122019-09-27 18:10:33 -07001716 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001717 ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
XNNPACK Teamb455b122019-09-27 18:10:33 -07001718 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1719 ASSERT_NEAR(
1720 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001721 double(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
XNNPACK Teamb455b122019-09-27 18:10:33 -07001722 0.9)
1723 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1724 }
1725 }
1726 }
1727 }
1728 }
1729
1730      // Re-generate the input for the second run (the kernel and bias are already packed inside the operator).
1731 std::generate(input.begin(), input.end(), std::ref(u8rng));
1732 std::fill(output.begin(), output.end(), 0xA5);
1733
1734 // Compute reference results for the second run, including renormalization.
Marat Dukhanf568f082019-10-30 09:47:07 -07001735 if (has_bias()) {
1736 for (size_t i = 0; i < next_batch_size(); i++) {
1737 for (size_t oy = 0; oy < next_output_height(); oy++) {
1738 for (size_t ox = 0; ox < next_output_width(); ox++) {
1739 for (size_t g = 0; g < groups(); g++) {
1740 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1741 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1742 bias[g * group_output_channels() + oc];
1743 }
XNNPACK Teamb455b122019-09-27 18:10:33 -07001744 }
1745 }
1746 }
1747 }
Marat Dukhanf568f082019-10-30 09:47:07 -07001748 } else {
1749 std::fill(next_accumulators.begin(), next_accumulators.end(), 0);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001750 }
1751 for (size_t i = 0; i < next_batch_size(); i++) {
1752 for (size_t oy = 0; oy < next_output_height(); oy++) {
1753 for (size_t ox = 0; ox < next_output_width(); ox++) {
1754 for (size_t ky = 0; ky < kernel_height(); ky++) {
1755 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1756 if (iy < next_input_height()) {
1757 for (size_t kx = 0; kx < kernel_width(); kx++) {
1758 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1759 if (ix < next_input_width()) {
1760 for (size_t g = 0; g < groups(); g++) {
1761 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1762 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1763 next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001764 (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
XNNPACK Teamb455b122019-09-27 18:10:33 -07001765 (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
1766 }
1767 }
1768 }
1769 }
1770 }
1771 }
1772 }
1773 }
1774 }
1775 }
1776 std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(),
1777 [this, output_scale, output_zero_point](int32_t x) -> double {
1778 return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
1779 });
1780
1781 // Setup and run Convolution operator the second time, and destroy the operator.
1782 ASSERT_EQ(xnn_status_success,
Marat Dukhan08b7a972020-07-14 18:17:29 -07001783 xnn_setup_convolution2d_nhwc_qu8(
XNNPACK Teamb455b122019-09-27 18:10:33 -07001784 convolution_op,
1785 next_batch_size(), next_input_height(), next_input_width(),
1786 input.data(), output.data(),
1787 nullptr /* thread pool */));
1788
1789 ASSERT_EQ(xnn_status_success,
1790 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1791
1792 // Verify results of the second run.
1793 for (size_t i = 0; i < next_batch_size(); i++) {
1794 for (size_t y = 0; y < next_output_height(); y++) {
1795 for (size_t x = 0; x < next_output_width(); x++) {
1796 for (size_t g = 0; g < groups(); g++) {
1797 for (size_t c = 0; c < group_output_channels(); c++) {
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001798 ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
XNNPACK Teamb455b122019-09-27 18:10:33 -07001799 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001800 ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
XNNPACK Teamb455b122019-09-27 18:10:33 -07001801 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1802 ASSERT_NEAR(
1803 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07001804 double(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
XNNPACK Teamb455b122019-09-27 18:10:33 -07001805 0.9)
1806 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1807 }
1808 }
1809 }
1810 }
1811 }
1812 }
1813 }
1814
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001815 void TestSetupNHWCxF16() const {
1816 ASSERT_FALSE(depthwise_layout());
1817
1818 std::random_device random_device;
1819 auto rng = std::mt19937(random_device());
Frank Barchard7d2c1f22020-09-14 16:43:53 -07001820 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001821 auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng);
1822
1823 std::vector<uint16_t> input(XNN_EXTRA_BYTES / sizeof(uint16_t) + std::max(
1824 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
1825 next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
1826 std::vector<uint16_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
1827 std::vector<uint16_t> bias(groups() * group_output_channels());
1828 std::vector<uint16_t> output(std::max(
1829 batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
1830 next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
1831 std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
1832 std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
1833
1834 for (size_t iteration = 0; iteration < iterations(); iteration++) {
1835 std::generate(input.begin(), input.end(), std::ref(f16rng));
1836 std::generate(kernel.begin(), kernel.end(), std::ref(f16rng));
1837 std::generate(bias.begin(), bias.end(), std::ref(f16rng));
1838 std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
1839
1840 // Compute reference results, without clamping.
1841 if (has_bias()) {
1842 for (size_t i = 0; i < batch_size(); i++) {
1843 for (size_t oy = 0; oy < output_height(); oy++) {
1844 for (size_t ox = 0; ox < output_width(); ox++) {
1845 for (size_t g = 0; g < groups(); g++) {
1846 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1847 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1848 fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
1849 }
1850 }
1851 }
1852 }
1853 }
1854 } else {
1855 std::fill(output_ref.begin(), output_ref.end(), 0.0f);
1856 }
1857 for (size_t i = 0; i < batch_size(); i++) {
1858 for (size_t oy = 0; oy < output_height(); oy++) {
1859 for (size_t ox = 0; ox < output_width(); ox++) {
1860 for (size_t ky = 0; ky < kernel_height(); ky++) {
1861 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1862 if (iy < input_height()) {
1863 for (size_t kx = 0; kx < kernel_width(); kx++) {
1864 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1865 if (ix < input_width()) {
1866 for (size_t g = 0; g < groups(); g++) {
1867 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1868 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1869 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1870 fp16_ieee_to_fp32_value(input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) *
1871 fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1872 }
1873 }
1874 }
1875 }
1876 }
1877 }
1878 }
1879 }
1880 }
1881 }
1882
1883 // Compute clamping parameters.
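      // The clamping bounds are first rounded through fp16 so the limits passed to the fp16
      // operator are exactly representable; if that rounding collapses them to the same value,
      // clamping is effectively disabled by substituting infinities.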
1884 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
1885 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
1886 const float accumulated_range = accumulated_max - accumulated_min;
1887 const float scaled_min = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_min + accumulated_range / 255.0f * float(qmin())));
1888 const float scaled_max = fp16_ieee_to_fp32_value(fp16_ieee_from_fp32_value(accumulated_max - accumulated_range / 255.0f * float(255 - qmax())));
1889 const float output_min = scaled_min == scaled_max ? -std::numeric_limits<float>::infinity() : scaled_min;
1890 const float output_max = scaled_min == scaled_max ? +std::numeric_limits<float>::infinity() : scaled_max;
1891
1892 for (float& output_value : output_ref) {
1893 output_value = std::min(std::max(output_value, output_min), output_max);
1894 }
1895
1896 // Create, setup, and run Convolution operator once.
1897 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
1898 xnn_operator_t convolution_op = nullptr;
1899
1900 xnn_status status = xnn_create_convolution2d_nhwc_f16(
1901 padding_top(), padding_right(), padding_bottom(), padding_left(),
1902 kernel_height(), kernel_width(),
1903 subsampling_height(), subsampling_width(),
1904 dilation_height(), dilation_width(),
1905 groups(), group_input_channels(), group_output_channels(),
1906 input_channel_stride(), output_channel_stride(),
1907 kernel.data(), has_bias() ? bias.data() : nullptr,
1908 output_min, output_max,
1909 0, &convolution_op);
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001910 if (status == xnn_status_unsupported_hardware) {
1911 GTEST_SKIP();
1912 }
1913 ASSERT_EQ(xnn_status_success, status);
1914 ASSERT_NE(nullptr, convolution_op);
1915
1916 // Smart pointer to automatically delete convolution_op.
1917 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
1918
1919 ASSERT_EQ(xnn_status_success,
1920 xnn_setup_convolution2d_nhwc_f16(
1921 convolution_op,
1922 batch_size(), input_height(), input_width(),
1923 input.data(), output.data(),
1924 nullptr /* thread pool */));
1925
1926 ASSERT_EQ(xnn_status_success,
1927 xnn_run_operator(convolution_op, nullptr /* thread pool */));
1928
1929 // Verify results of the first run.
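      // fp16 carries about 11 bits of significand, so the comparison uses the larger of an
      // absolute 1.0e-4 and a 1% relative tolerance to absorb accumulation error.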
1930 for (size_t i = 0; i < batch_size(); i++) {
1931 for (size_t y = 0; y < output_height(); y++) {
1932 for (size_t x = 0; x < output_width(); x++) {
1933 for (size_t g = 0; g < groups(); g++) {
1934 for (size_t c = 0; c < group_output_channels(); c++) {
1935 ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min)
1936 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1937 ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max)
1938 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
Frank Barchard2b9d29b2020-09-17 12:03:39 -07001939 ASSERT_NEAR(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c], fp16_ieee_to_fp32_value(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), std::max(1.0e-4f, std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]) * 1.0e-2f))
Frank Barchard49b4dcc2020-06-26 14:07:19 -07001940 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
1941 }
1942 }
1943 }
1944 }
1945 }
1946
1947      // Re-generate the input for the second run (the kernel and bias are already packed inside the operator).
1948 std::generate(input.begin(), input.end(), std::ref(f16rng));
1949 std::fill(output.begin(), output.end(), UINT16_C(0x7E00) /* NaN */);
1950
1951 // Compute reference results for the second run, including clamping.
1952 if (has_bias()) {
1953 for (size_t i = 0; i < next_batch_size(); i++) {
1954 for (size_t oy = 0; oy < next_output_height(); oy++) {
1955 for (size_t ox = 0; ox < next_output_width(); ox++) {
1956 for (size_t g = 0; g < groups(); g++) {
1957 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1958 next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
1959 fp16_ieee_to_fp32_value(bias[g * group_output_channels() + oc]);
1960 }
1961 }
1962 }
1963 }
1964 }
1965 } else {
1966 std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
1967 }
1968 for (size_t i = 0; i < next_batch_size(); i++) {
1969 for (size_t oy = 0; oy < next_output_height(); oy++) {
1970 for (size_t ox = 0; ox < next_output_width(); ox++) {
1971 for (size_t ky = 0; ky < kernel_height(); ky++) {
1972 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
1973 if (iy < next_input_height()) {
1974 for (size_t kx = 0; kx < kernel_width(); kx++) {
1975 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
1976 if (ix < next_input_width()) {
1977 for (size_t g = 0; g < groups(); g++) {
1978 for (size_t oc = 0; oc < group_output_channels(); oc++) {
1979 for (size_t ic = 0; ic < group_input_channels(); ic++) {
1980 next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
1981 fp16_ieee_to_fp32_value(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic]) *
1982 fp16_ieee_to_fp32_value(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]);
1983 }
1984 }
1985 }
1986 }
1987 }
1988 }
1989 }
1990 }
1991 }
1992 }
1993 for (float& value : next_output_ref) {
1994 value = std::max(std::min(value, output_max), output_min);
1995 }
1996
1997 // Setup and run Convolution operator the second time, and destroy the operator.
1998 ASSERT_EQ(xnn_status_success,
1999 xnn_setup_convolution2d_nhwc_f16(
2000 convolution_op,
2001 next_batch_size(), next_input_height(), next_input_width(),
2002 input.data(), output.data(),
2003 nullptr /* thread pool */));
2004
2005 ASSERT_EQ(xnn_status_success,
2006 xnn_run_operator(convolution_op, nullptr /* thread pool */));
2007
2008 // Verify results of the second run.
2009 for (size_t i = 0; i < next_batch_size(); i++) {
2010 for (size_t y = 0; y < next_output_height(); y++) {
2011 for (size_t x = 0; x < next_output_width(); x++) {
2012 for (size_t g = 0; g < groups(); g++) {
2013 for (size_t c = 0; c < group_output_channels(); c++) {
2014 ASSERT_GE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_min)
2015 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2016 ASSERT_LE(fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), output_max)
2017 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
Frank Barchard2b9d29b2020-09-17 12:03:39 -07002018 ASSERT_NEAR(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c], fp16_ieee_to_fp32_value(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c]), std::max(1.0e-4f, std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]) * 1.0e-2f))
Frank Barchard49b4dcc2020-06-26 14:07:19 -07002019 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2020 }
2021 }
2022 }
2023 }
2024 }
2025 }
2026 }
2027
Marat Dukhanefc47b82019-11-18 09:25:38 -08002028 void TestSetupNHWCxF32() const {
XNNPACK Teamb455b122019-09-27 18:10:33 -07002029 ASSERT_FALSE(depthwise_layout());
2030
2031 std::random_device random_device;
2032 auto rng = std::mt19937(random_device());
2033 auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);
2034
2035 std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + std::max(
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002036 batch_size() * ((input_height() * input_width() - 1) * input_channel_stride() + groups() * group_input_channels()),
2037 next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_channel_stride() + groups() * group_input_channels())));
XNNPACK Teamb455b122019-09-27 18:10:33 -07002038 std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
2039 std::vector<float> bias(groups() * group_output_channels());
2040 std::vector<float> output(std::max(
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002041 batch_size() * ((output_height() * output_width() - 1) * output_channel_stride() + groups() * group_output_channels()),
2042 next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_channel_stride() + groups() * group_output_channels())));
XNNPACK Teamb455b122019-09-27 18:10:33 -07002043 std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
2044 std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
2045
2046 for (size_t iteration = 0; iteration < iterations(); iteration++) {
2047 std::generate(input.begin(), input.end(), std::ref(f32rng));
2048 std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
2049 std::generate(bias.begin(), bias.end(), std::ref(f32rng));
2050 std::fill(output.begin(), output.end(), nanf(""));
2051
2052 // Compute reference results, without clamping.
Marat Dukhanf568f082019-10-30 09:47:07 -07002053 if (has_bias()) {
2054 for (size_t i = 0; i < batch_size(); i++) {
2055 for (size_t oy = 0; oy < output_height(); oy++) {
2056 for (size_t ox = 0; ox < output_width(); ox++) {
2057 for (size_t g = 0; g < groups(); g++) {
2058 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2059 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2060 bias[g * group_output_channels() + oc];
2061 }
XNNPACK Teamb455b122019-09-27 18:10:33 -07002062 }
2063 }
2064 }
2065 }
Marat Dukhanf568f082019-10-30 09:47:07 -07002066 } else {
2067 std::fill(output_ref.begin(), output_ref.end(), 0.0f);
XNNPACK Teamb455b122019-09-27 18:10:33 -07002068 }
2069 for (size_t i = 0; i < batch_size(); i++) {
2070 for (size_t oy = 0; oy < output_height(); oy++) {
2071 for (size_t ox = 0; ox < output_width(); ox++) {
2072 for (size_t ky = 0; ky < kernel_height(); ky++) {
2073 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2074 if (iy < input_height()) {
2075 for (size_t kx = 0; kx < kernel_width(); kx++) {
2076 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2077 if (ix < input_width()) {
2078 for (size_t g = 0; g < groups(); g++) {
2079 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2080 for (size_t ic = 0; ic < group_input_channels(); ic++) {
2081 output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002082 input[((i * input_height() + iy) * input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] *
XNNPACK Teamb455b122019-09-27 18:10:33 -07002083 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
2084 }
2085 }
2086 }
2087 }
2088 }
2089 }
2090 }
2091 }
2092 }
2093 }
2094
2095 // Compute clamping parameters.
2096 const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
2097 const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
2098
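      // Unlike the NCHW F32 test above, the bounds are not special-cased here: with the default
      // qmin() == 0 and qmax() == 255 they simply equal the accumulated extrema, which still
      // leaves the reference results unchanged by clamping.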
2099 const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
2100 const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
2101
2102 // Clamp reference results.
2103 for (float& value : output_ref) {
2104 value = std::max(std::min(value, output_max), output_min);
2105 }
2106
2107 // Create, setup, and run Convolution operator once.
Marat Dukhan04f03be2019-11-19 12:36:47 -08002108 ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
XNNPACK Teamb455b122019-09-27 18:10:33 -07002109 xnn_operator_t convolution_op = nullptr;
2110
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002111 xnn_status status = xnn_create_convolution2d_nhwc_f32(
XNNPACK Teamb455b122019-09-27 18:10:33 -07002112 padding_top(), padding_right(), padding_bottom(), padding_left(),
2113 kernel_height(), kernel_width(),
2114 subsampling_height(), subsampling_width(),
2115 dilation_height(), dilation_width(),
2116 groups(), group_input_channels(), group_output_channels(),
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002117 input_channel_stride(), output_channel_stride(),
Marat Dukhanf568f082019-10-30 09:47:07 -07002118 kernel.data(), has_bias() ? bias.data() : nullptr,
XNNPACK Teamb455b122019-09-27 18:10:33 -07002119 output_min, output_max,
Marat Dukhan16f1e1a2020-08-04 16:38:22 -07002120 0, &convolution_op);
2121 if (status == xnn_status_unsupported_hardware) {
2122 GTEST_SKIP();
2123 }
2124 ASSERT_EQ(xnn_status_success, status);
2125 ASSERT_NE(nullptr, convolution_op);
XNNPACK Teamb455b122019-09-27 18:10:33 -07002126
2127 // Smart pointer to automatically delete convolution_op.
2128 std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
2129
2130 ASSERT_EQ(xnn_status_success,
2131 xnn_setup_convolution2d_nhwc_f32(
2132 convolution_op,
2133 batch_size(), input_height(), input_width(),
2134 input.data(), output.data(),
2135 nullptr /* thread pool */));
2136
2137 ASSERT_EQ(xnn_status_success,
2138 xnn_run_operator(convolution_op, nullptr /* thread pool */));
2139
2140 // Verify results of the first run.
2141 for (size_t i = 0; i < batch_size(); i++) {
2142 for (size_t y = 0; y < output_height(); y++) {
2143 for (size_t x = 0; x < output_width(); x++) {
2144 for (size_t g = 0; g < groups(); g++) {
2145 for (size_t c = 0; c < group_output_channels(); c++) {
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002146 ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min)
XNNPACK Teamb455b122019-09-27 18:10:33 -07002147 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002148 ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max)
XNNPACK Teamb455b122019-09-27 18:10:33 -07002149 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2150 ASSERT_NEAR(
2151 output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002152 output[((i * output_height() + y) * output_width() + x) * output_channel_stride() + g * group_output_channels() + c],
XNNPACK Teamb455b122019-09-27 18:10:33 -07002153 1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
2154 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2155 }
2156 }
2157 }
2158 }
2159 }
2160
2161      // Re-generate the input for the second run (the kernel and bias are already packed inside the operator).
2162 std::generate(input.begin(), input.end(), std::ref(f32rng));
2163 std::fill(output.begin(), output.end(), nanf(""));
2164
2165 // Compute reference results for the second run, including clamping.
Marat Dukhanf568f082019-10-30 09:47:07 -07002166 if (has_bias()) {
2167 for (size_t i = 0; i < next_batch_size(); i++) {
2168 for (size_t oy = 0; oy < next_output_height(); oy++) {
2169 for (size_t ox = 0; ox < next_output_width(); ox++) {
2170 for (size_t g = 0; g < groups(); g++) {
2171 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2172 next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
2173 bias[g * group_output_channels() + oc];
2174 }
XNNPACK Teamb455b122019-09-27 18:10:33 -07002175 }
2176 }
2177 }
2178 }
Marat Dukhanf568f082019-10-30 09:47:07 -07002179 } else {
2180 std::fill(next_output_ref.begin(), next_output_ref.end(), 0.0f);
XNNPACK Teamb455b122019-09-27 18:10:33 -07002181 }
2182 for (size_t i = 0; i < next_batch_size(); i++) {
2183 for (size_t oy = 0; oy < next_output_height(); oy++) {
2184 for (size_t ox = 0; ox < next_output_width(); ox++) {
2185 for (size_t ky = 0; ky < kernel_height(); ky++) {
2186 const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
2187 if (iy < next_input_height()) {
2188 for (size_t kx = 0; kx < kernel_width(); kx++) {
2189 const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
2190 if (ix < next_input_width()) {
2191 for (size_t g = 0; g < groups(); g++) {
2192 for (size_t oc = 0; oc < group_output_channels(); oc++) {
2193 for (size_t ic = 0; ic < group_input_channels(); ic++) {
2194 next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002195 input[((i * next_input_height() + iy) * next_input_width() + ix) * input_channel_stride() + g * group_input_channels() + ic] *
XNNPACK Teamb455b122019-09-27 18:10:33 -07002196 kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
2197 }
2198 }
2199 }
2200 }
2201 }
2202 }
2203 }
2204 }
2205 }
2206 }
2207 for (float& value : next_output_ref) {
2208 value = std::max(std::min(value, output_max), output_min);
2209 }
2210
2211 // Setup and run Convolution operator the second time, and destroy the operator.
2212 ASSERT_EQ(xnn_status_success,
2213 xnn_setup_convolution2d_nhwc_f32(
2214 convolution_op,
2215 next_batch_size(), next_input_height(), next_input_width(),
2216 input.data(), output.data(),
2217 nullptr /* thread pool */));
2218
2219 ASSERT_EQ(xnn_status_success,
2220 xnn_run_operator(convolution_op, nullptr /* thread pool */));
2221
2222 // Verify results of the second run.
2223 for (size_t i = 0; i < next_batch_size(); i++) {
2224 for (size_t y = 0; y < next_output_height(); y++) {
2225 for (size_t x = 0; x < next_output_width(); x++) {
2226 for (size_t g = 0; g < groups(); g++) {
2227 for (size_t c = 0; c < group_output_channels(); c++) {
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002228 ASSERT_GE(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_min)
XNNPACK Teamb455b122019-09-27 18:10:33 -07002229 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002230 ASSERT_LE(output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c], output_max)
XNNPACK Teamb455b122019-09-27 18:10:33 -07002231 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2232 ASSERT_NEAR(
2233 next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002234 output[((i * next_output_height() + y) * next_output_width() + x) * output_channel_stride() + g * group_output_channels() + c],
XNNPACK Teamb455b122019-09-27 18:10:33 -07002235 1.0e-4 * std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]))
2236 << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
2237 }
2238 }
2239 }
2240 }
2241 }
2242 }
2243 }
2244
2245 private:
2246 uint32_t padding_top_{0};
2247 uint32_t padding_right_{0};
2248 uint32_t padding_bottom_{0};
2249 uint32_t padding_left_{0};
Marat Dukhan8440fde2019-10-24 12:46:13 -07002250 bool padding_tf_same_{false};
XNNPACK Teamb455b122019-09-27 18:10:33 -07002251 size_t input_height_{1};
2252 size_t input_width_{1};
2253 uint32_t groups_{1};
2254 size_t group_input_channels_{1};
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002255 size_t input_channel_stride_{0};
XNNPACK Teamb455b122019-09-27 18:10:33 -07002256 size_t group_output_channels_{1};
Marat Dukhanc3d52cf2020-06-18 07:56:25 -07002257 size_t output_channel_stride_{0};
XNNPACK Teamb455b122019-09-27 18:10:33 -07002258 size_t batch_size_{1};
2259 uint32_t kernel_height_{1};
2260 uint32_t kernel_width_{1};
2261 uint32_t dilation_height_{1};
2262 uint32_t dilation_width_{1};
2263 uint32_t subsampling_height_{1};
2264 uint32_t subsampling_width_{1};
2265 size_t next_input_height_{0};
2266 size_t next_input_width_{0};
2267 size_t next_batch_size_{0};
Marat Dukhanefc47b82019-11-18 09:25:38 -08002268 float sparsity_{0.0f};
XNNPACK Teamb455b122019-09-27 18:10:33 -07002269 uint8_t qmin_{0};
2270 uint8_t qmax_{255};
2271 bool depthwise_layout_{false};
Marat Dukhanefc47b82019-11-18 09:25:38 -08002272 bool force_nhwc_input_{false};
Marat Dukhanf568f082019-10-30 09:47:07 -07002273 bool has_bias_{true};
XNNPACK Teamb455b122019-09-27 18:10:33 -07002274 size_t iterations_{1};
2275};