// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#pragma once

#include <gtest/gtest.h>

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <limits>
#include <memory>
#include <random>
#include <vector>

#include <xnnpack.h>

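// GoogleTest helper that exercises XNNPACK's 2D convolution operators.
// Parameters are configured through chainable setters; each Test*() method
// fills random inputs, computes a reference convolution on the CPU, runs the
// operator under test, and checks the two results against each other.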
class ConvolutionOperatorTester {
 public:
  inline ConvolutionOperatorTester& padding(uint32_t padding) {
    this->padding_top_ = padding;
    this->padding_right_ = padding;
    this->padding_bottom_ = padding;
    this->padding_left_ = padding;
    return *this;
  }

  inline ConvolutionOperatorTester& padding(uint32_t padding_height, uint32_t padding_width) {
    this->padding_top_ = padding_height;
    this->padding_right_ = padding_width;
    this->padding_bottom_ = padding_height;
    this->padding_left_ = padding_width;
    return *this;
  }

  inline ConvolutionOperatorTester& padding_height(uint32_t padding_height) {
    this->padding_top_ = padding_height;
    this->padding_bottom_ = padding_height;
    return *this;
  }

  inline ConvolutionOperatorTester& padding_width(uint32_t padding_width) {
    this->padding_right_ = padding_width;
    this->padding_left_ = padding_width;
    return *this;
  }

  inline ConvolutionOperatorTester& padding_top(uint32_t padding_top) {
    this->padding_top_ = padding_top;
    return *this;
  }

  inline uint32_t padding_top() const {
    return this->padding_top_;
  }

  inline ConvolutionOperatorTester& padding_right(uint32_t padding_right) {
    this->padding_right_ = padding_right;
    return *this;
  }

  inline uint32_t padding_right() const {
    return this->padding_right_;
  }

  inline ConvolutionOperatorTester& padding_bottom(uint32_t padding_bottom) {
    this->padding_bottom_ = padding_bottom;
    return *this;
  }

  inline uint32_t padding_bottom() const {
    return this->padding_bottom_;
  }

  inline ConvolutionOperatorTester& padding_left(uint32_t padding_left) {
    this->padding_left_ = padding_left;
    return *this;
  }

  inline uint32_t padding_left() const {
    return this->padding_left_;
  }

  inline ConvolutionOperatorTester& input_size(uint32_t input_height, uint32_t input_width) {
    assert(input_height >= 1);
    assert(input_width >= 1);
    this->input_height_ = input_height;
    this->input_width_ = input_width;
    return *this;
  }

  inline ConvolutionOperatorTester& input_height(uint32_t input_height) {
    assert(input_height >= 1);
    this->input_height_ = input_height;
    return *this;
  }

  inline uint32_t input_height() const {
    return this->input_height_;
  }

  inline ConvolutionOperatorTester& input_width(uint32_t input_width) {
    assert(input_width >= 1);
    this->input_width_ = input_width;
    return *this;
  }

  inline uint32_t input_width() const {
    return this->input_width_;
  }

  inline ConvolutionOperatorTester& groups(uint32_t groups) {
    assert(groups >= 1);
    this->groups_ = groups;
    return *this;
  }

  inline uint32_t groups() const {
    return this->groups_;
  }

  inline ConvolutionOperatorTester& group_input_channels(size_t group_input_channels) {
    assert(group_input_channels >= 1);
    this->group_input_channels_ = group_input_channels;
    return *this;
  }

  inline size_t group_input_channels() const {
    return this->group_input_channels_;
  }

  inline ConvolutionOperatorTester& group_output_channels(size_t group_output_channels) {
    assert(group_output_channels >= 1);
    this->group_output_channels_ = group_output_channels;
    return *this;
  }

  inline size_t group_output_channels() const {
    return this->group_output_channels_;
  }

  inline ConvolutionOperatorTester& batch_size(size_t batch_size) {
    assert(batch_size >= 1);
    this->batch_size_ = batch_size;
    return *this;
  }

  inline size_t batch_size() const {
    return this->batch_size_;
  }

  inline ConvolutionOperatorTester& kernel_size(uint32_t kernel_size) {
    assert(kernel_size >= 1);
    this->kernel_height_ = kernel_size;
    this->kernel_width_ = kernel_size;
    return *this;
  }

  inline ConvolutionOperatorTester& kernel_size(uint32_t kernel_height, uint32_t kernel_width) {
    assert(kernel_height >= 1);
    assert(kernel_width >= 1);
    this->kernel_height_ = kernel_height;
    this->kernel_width_ = kernel_width;
    return *this;
  }

  inline ConvolutionOperatorTester& kernel_height(uint32_t kernel_height) {
    assert(kernel_height >= 1);
    this->kernel_height_ = kernel_height;
    return *this;
  }

  inline uint32_t kernel_height() const {
    return this->kernel_height_;
  }

  inline ConvolutionOperatorTester& kernel_width(uint32_t kernel_width) {
    assert(kernel_width >= 1);
    this->kernel_width_ = kernel_width;
    return *this;
  }

  inline uint32_t kernel_width() const {
    return this->kernel_width_;
  }

  inline ConvolutionOperatorTester& dilation(uint32_t dilation) {
    assert(dilation >= 1);
    this->dilation_height_ = dilation;
    this->dilation_width_ = dilation;
    return *this;
  }

  inline ConvolutionOperatorTester& dilation(uint32_t dilation_height, uint32_t dilation_width) {
    assert(dilation_height >= 1);
    assert(dilation_width >= 1);
    this->dilation_height_ = dilation_height;
    this->dilation_width_ = dilation_width;
    return *this;
  }

  inline ConvolutionOperatorTester& dilation_height(uint32_t dilation_height) {
    assert(dilation_height >= 1);
    this->dilation_height_ = dilation_height;
    return *this;
  }

  inline uint32_t dilation_height() const {
    return this->dilation_height_;
  }

  inline ConvolutionOperatorTester& dilation_width(uint32_t dilation_width) {
    assert(dilation_width >= 1);
    this->dilation_width_ = dilation_width;
    return *this;
  }

  inline uint32_t dilation_width() const {
    return this->dilation_width_;
  }

  inline ConvolutionOperatorTester& subsampling(uint32_t subsampling) {
    assert(subsampling >= 1);
    this->subsampling_height_ = subsampling;
    this->subsampling_width_ = subsampling;
    return *this;
  }

  inline ConvolutionOperatorTester& subsampling(uint32_t subsampling_height, uint32_t subsampling_width) {
    assert(subsampling_height >= 1);
    assert(subsampling_width >= 1);
    this->subsampling_height_ = subsampling_height;
    this->subsampling_width_ = subsampling_width;
    return *this;
  }

  inline ConvolutionOperatorTester& subsampling_height(uint32_t subsampling_height) {
    assert(subsampling_height >= 1);
    this->subsampling_height_ = subsampling_height;
    return *this;
  }

  inline uint32_t subsampling_height() const {
    return this->subsampling_height_;
  }

  inline ConvolutionOperatorTester& subsampling_width(uint32_t subsampling_width) {
    assert(subsampling_width >= 1);
    this->subsampling_width_ = subsampling_width;
    return *this;
  }

  inline uint32_t subsampling_width() const {
    return this->subsampling_width_;
  }

  inline ConvolutionOperatorTester& input_pixel_stride(size_t input_pixel_stride) {
    assert(input_pixel_stride >= 1);
    this->input_pixel_stride_ = input_pixel_stride;
    return *this;
  }

  inline size_t input_pixel_stride() const {
    if (this->input_pixel_stride_ == 0) {
      return group_input_channels() * groups();
    } else {
      assert(this->input_pixel_stride_ >= group_input_channels() * groups());
      return this->input_pixel_stride_;
    }
  }

  inline ConvolutionOperatorTester& output_pixel_stride(size_t output_pixel_stride) {
    assert(output_pixel_stride >= 1);
    this->output_pixel_stride_ = output_pixel_stride;
    return *this;
  }

  inline size_t output_pixel_stride() const {
    if (this->output_pixel_stride_ == 0) {
      return group_output_channels() * groups();
    } else {
      assert(this->output_pixel_stride_ >= group_output_channels() * groups());
      return this->output_pixel_stride_;
    }
  }

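  // A kernel tap at offset k with dilation d samples the input at position
  // k * d, so a K-tap kernel with dilation D spans (K - 1) * D + 1 input
  // pixels.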
  inline uint32_t dilated_kernel_height() const {
    return (kernel_height() - 1) * dilation_height() + 1;
  }

  inline uint32_t dilated_kernel_width() const {
    return (kernel_width() - 1) * dilation_width() + 1;
  }

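  // Expected output size of a strided convolution, with floor division:
  //   output = (padded_input - dilated_kernel) / subsampling + 1,
  // clamped to at least 1 when the dilated kernel covers the padded input.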
  inline size_t output_height() const {
    const size_t padded_input_height = padding_top() + input_height() + padding_bottom();
    if (padded_input_height <= dilated_kernel_height()) {
      return 1;
    } else {
      return (padded_input_height - dilated_kernel_height()) / subsampling_height() + 1;
    }
  }

  inline size_t output_width() const {
    const size_t padded_input_width = padding_left() + input_width() + padding_right();
    if (padded_input_width <= dilated_kernel_width()) {
      return 1;
    } else {
      return (padded_input_width - dilated_kernel_width()) / subsampling_width() + 1;
    }
  }

  inline ConvolutionOperatorTester& next_input_size(uint32_t next_input_height, uint32_t next_input_width) {
    assert(next_input_height >= 1);
    assert(next_input_width >= 1);
    this->next_input_height_ = next_input_height;
    this->next_input_width_ = next_input_width;
    return *this;
  }

  inline ConvolutionOperatorTester& next_input_height(uint32_t next_input_height) {
    assert(next_input_height >= 1);
    this->next_input_height_ = next_input_height;
    return *this;
  }

  inline uint32_t next_input_height() const {
    if (this->next_input_height_ == 0) {
      return input_height();
    } else {
      return this->next_input_height_;
    }
  }

  inline ConvolutionOperatorTester& next_input_width(uint32_t next_input_width) {
    assert(next_input_width >= 1);
    this->next_input_width_ = next_input_width;
    return *this;
  }

  inline uint32_t next_input_width() const {
    if (this->next_input_width_ == 0) {
      return input_width();
    } else {
      return this->next_input_width_;
    }
  }

  inline size_t next_output_height() const {
    const size_t padded_input_height = padding_top() + next_input_height() + padding_bottom();
    if (padded_input_height <= dilated_kernel_height()) {
      return 1;
    } else {
      return (padded_input_height - dilated_kernel_height()) / subsampling_height() + 1;
    }
  }

  inline size_t next_output_width() const {
    const size_t padded_input_width = padding_left() + next_input_width() + padding_right();
    if (padded_input_width <= dilated_kernel_width()) {
      return 1;
    } else {
      return (padded_input_width - dilated_kernel_width()) / subsampling_width() + 1;
    }
  }

  inline ConvolutionOperatorTester& next_batch_size(size_t next_batch_size) {
    assert(next_batch_size >= 1);
    this->next_batch_size_ = next_batch_size;
    return *this;
  }

  inline size_t next_batch_size() const {
    if (this->next_batch_size_ == 0) {
      return batch_size();
    } else {
      return this->next_batch_size_;
    }
  }

  inline ConvolutionOperatorTester& qmin(uint8_t qmin) {
    this->qmin_ = qmin;
    return *this;
  }

  inline uint8_t qmin() const {
    return this->qmin_;
  }

  inline ConvolutionOperatorTester& qmax(uint8_t qmax) {
    this->qmax_ = qmax;
    return *this;
  }

  inline uint8_t qmax() const {
    return this->qmax_;
  }

  inline ConvolutionOperatorTester& depthwise_layout(bool depthwise_layout) {
    this->depthwise_layout_ = depthwise_layout;
    return *this;
  }

  inline bool depthwise_layout() const {
    return this->depthwise_layout_;
  }

  inline ConvolutionOperatorTester& iterations(size_t iterations) {
    this->iterations_ = iterations;
    return *this;
  }

  inline size_t iterations() const {
    return this->iterations_;
  }

  void TestQ8() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
    // Note: std::uniform_int_distribution<uint8_t> is undefined behavior per
    // the C++ standard (the result type must be at least int-wide), so draw
    // 32-bit values restricted to the uint8 range instead.
    auto u8rng = std::bind(
      std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);

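    // Buffers are over-allocated by XNN_EXTRA_BYTES because XNNPACK
    // micro-kernels may read (but never use) a few bytes past the last
    // logical element.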
    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
      batch_size() * ((input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()) + 8);
    std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<int32_t> bias(groups() * group_output_channels());
    std::vector<uint8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()));
    std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    const uint8_t input_zero_point = 127;
    const uint8_t kernel_zero_point = 127;

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(u8rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
      std::generate(bias.begin(), bias.end(), std::ref(s32rng));
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results, without renormalization.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t oc = 0; oc < group_output_channels(); oc++) {
                accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                  bias[g * group_output_channels() + oc];
              }
            }
          }
        }
      }
      if (depthwise_layout()) {
        ASSERT_EQ(group_input_channels(), 1);

        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
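                // iy is unsigned, so taps that fall into the padding region
                // wrap around to huge values; a single iy < input_height()
                // check rejects out-of-bounds rows on both sides.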
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g]) - int32_t(input_zero_point)) *
                            (int32_t(kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]) - int32_t(kernel_zero_point));
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      } else {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          for (size_t ic = 0; ic < group_input_channels(); ic++) {
                            accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                              (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
                              (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());

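      // Map the accumulated range onto the uint8 range: one quantization step
      // per 1/255 of (max - min), with a zero point chosen so the range is
      // centered.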
      const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
      const uint8_t output_zero_point = uint8_t(std::max(std::min(
        lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));

      // Renormalize reference results.
      std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
        [this, output_scale, output_zero_point](int32_t x) -> double {
          return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
        });

      // Create, setup, run, and destroy Convolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize());
      xnn_operator_t convolution_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_convolution2d_nhwc_q8(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          kernel_height(), kernel_width(),
          subsampling_height(), subsampling_width(),
          dilation_height(), dilation_width(),
          groups(), group_input_channels(), group_output_channels(),
          input_pixel_stride(), output_pixel_stride(),
          input_zero_point, 1.0f /* input scale */,
          kernel_zero_point, 1.0f /* kernel scale */,
          kernel.data(), bias.data(),
          output_zero_point, output_scale, qmin(), qmax(),
          depthwise_layout() ? XNN_CONVOLUTION_FLAG_DEPTHWISE : 0,
          &convolution_op));

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_q8(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                  output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                  double(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
                  0.9)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

  void TestF32() const {
    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);

    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
      batch_size() * ((input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()));
    std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<float> bias(groups() * group_output_channels());
    std::vector<float> output(batch_size() * ((output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
      std::generate(bias.begin(), bias.end(), std::ref(f32rng));
      std::fill(output.begin(), output.end(), nanf(""));

      // Compute reference results, without clamping.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t oc = 0; oc < group_output_channels(); oc++) {
                output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                  bias[g * group_output_channels() + oc];
              }
            }
          }
        }
      }
      if (depthwise_layout()) {
        ASSERT_EQ(group_input_channels(), 1);

        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g] *
                            kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc];
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      } else {
        for (size_t i = 0; i < batch_size(); i++) {
          for (size_t oy = 0; oy < output_height(); oy++) {
            for (size_t ox = 0; ox < output_width(); ox++) {
              for (size_t ky = 0; ky < kernel_height(); ky++) {
                const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
                if (iy < input_height()) {
                  for (size_t kx = 0; kx < kernel_width(); kx++) {
                    const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                    if (ix < input_width()) {
                      for (size_t g = 0; g < groups(); g++) {
                        for (size_t oc = 0; oc < group_output_channels(); oc++) {
                          for (size_t ic = 0; ic < group_input_channels(); ic++) {
                            output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                              input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic] *
                              kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());

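      // Derive clamping bounds that cut qmin()/255 of the accumulated range
      // off the bottom and (255 - qmax())/255 off the top, so the clamp path
      // is exercised whenever qmin/qmax are non-trivial.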
      const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
      const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, run, and destroy Convolution operator.
      ASSERT_EQ(xnn_status_success, xnn_initialize());
      xnn_operator_t convolution_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_convolution2d_nhwc_f32(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          kernel_height(), kernel_width(),
          subsampling_height(), subsampling_width(),
          dilation_height(), dilation_width(),
          groups(), group_input_channels(), group_output_channels(),
          input_pixel_stride(), output_pixel_stride(),
          kernel.data(), bias.data(),
          output_min, output_max,
          depthwise_layout() ? XNN_CONVOLUTION_FLAG_DEPTHWISE : 0,
          &convolution_op));

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_f32(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_min)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_max)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                  output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                  output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c],
                  1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

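  // Creates the operator once, runs it, then calls the setup function again
  // with the next_* shapes and runs it a second time, verifying that a live
  // operator can be reconfigured for new input dimensions and batch size.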
  void TestSetupQ8() const {
    ASSERT_FALSE(depthwise_layout());

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
    // Same workaround as in TestQ8: uint8_t is not a valid distribution type,
    // so draw 32-bit values restricted to the uint8 range.
    auto u8rng = std::bind(
      std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);

    std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + std::max(
      batch_size() * ((input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()),
      next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_pixel_stride() + groups() * group_input_channels())) + 8);
    std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<int32_t> bias(groups() * group_output_channels());
    std::vector<uint8_t> output(std::max(
      batch_size() * ((output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()),
      next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_pixel_stride() + groups() * group_output_channels())));
    std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
    std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());

    const uint8_t input_zero_point = 127;
    const uint8_t kernel_zero_point = 127;

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(u8rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
      std::generate(bias.begin(), bias.end(), std::ref(s32rng));
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results, without renormalization.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t oc = 0; oc < group_output_channels(); oc++) {
                accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                  bias[g * group_output_channels() + oc];
              }
            }
          }
        }
      }
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
              if (iy < input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                  if (ix < input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            (int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
                            (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute renormalization parameters.
      const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
      const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());

      const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
      const uint8_t output_zero_point = uint8_t(std::max(std::min(
        lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
        long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));

      // Renormalize reference results.
      std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
        [this, output_scale, output_zero_point](int32_t x) -> double {
          return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
        });

      // Create, setup, and run Convolution operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize());
      xnn_operator_t convolution_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_convolution2d_nhwc_q8(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          kernel_height(), kernel_width(),
          subsampling_height(), subsampling_width(),
          dilation_height(), dilation_width(),
          groups(), group_input_channels(), group_output_channels(),
          input_pixel_stride(), output_pixel_stride(),
          input_zero_point, 1.0f /* input scale */,
          kernel_zero_point, 1.0f /* kernel scale */,
          kernel.data(), bias.data(),
          output_zero_point, output_scale, qmin(), qmax(),
          0, &convolution_op));

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_q8(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                  output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                  double(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
                  0.9)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), std::ref(u8rng));
      std::fill(output.begin(), output.end(), 0xA5);

      // Compute reference results for the second run, including renormalization.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t oc = 0; oc < group_output_channels(); oc++) {
                next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                  bias[g * group_output_channels() + oc];
              }
            }
          }
        }
      }
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
              if (iy < next_input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                  if (ix < next_input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            (int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
                            (int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
      std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(),
        [this, output_scale, output_zero_point](int32_t x) -> double {
          return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
        });

      // Setup and run Convolution operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_q8(
          convolution_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                  next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
                  double(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
                  0.9)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

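  // Like TestSetupQ8, but exercising re-setup of the F32 operator.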
  void TestSetupF32() const {
    ASSERT_FALSE(depthwise_layout());

    std::random_device random_device;
    auto rng = std::mt19937(random_device());
    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);

    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + std::max(
      batch_size() * ((input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()),
      next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_pixel_stride() + groups() * group_input_channels())));
    std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
    std::vector<float> bias(groups() * group_output_channels());
    std::vector<float> output(std::max(
      batch_size() * ((output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()),
      next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_pixel_stride() + groups() * group_output_channels())));
    std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
    std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());

    for (size_t iteration = 0; iteration < iterations(); iteration++) {
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
      std::generate(bias.begin(), bias.end(), std::ref(f32rng));
      std::fill(output.begin(), output.end(), nanf(""));

      // Compute reference results, without clamping.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t oc = 0; oc < group_output_channels(); oc++) {
                output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                  bias[g * group_output_channels() + oc];
              }
            }
          }
        }
      }
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t oy = 0; oy < output_height(); oy++) {
          for (size_t ox = 0; ox < output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
              if (iy < input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                  if (ix < input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic] *
                            kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }

      // Compute clamping parameters.
      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());

      const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
      const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());

      // Clamp reference results.
      for (float& value : output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Create, setup, and run Convolution operator once.
      ASSERT_EQ(xnn_status_success, xnn_initialize());
      xnn_operator_t convolution_op = nullptr;

      ASSERT_EQ(xnn_status_success,
        xnn_create_convolution2d_nhwc_f32(
          padding_top(), padding_right(), padding_bottom(), padding_left(),
          kernel_height(), kernel_width(),
          subsampling_height(), subsampling_width(),
          dilation_height(), dilation_width(),
          groups(), group_input_channels(), group_output_channels(),
          input_pixel_stride(), output_pixel_stride(),
          kernel.data(), bias.data(),
          output_min, output_max,
          0, &convolution_op));

      // Smart pointer to automatically delete convolution_op.
      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);

      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_f32(
          convolution_op,
          batch_size(), input_height(), input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results of the first run.
      for (size_t i = 0; i < batch_size(); i++) {
        for (size_t y = 0; y < output_height(); y++) {
          for (size_t x = 0; x < output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_min)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_max)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                  output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
                  output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c],
                  1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }

      // Re-generate data for the second run.
      std::generate(input.begin(), input.end(), std::ref(f32rng));
      std::fill(output.begin(), output.end(), nanf(""));

      // Compute reference results for the second run, including clamping.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t oc = 0; oc < group_output_channels(); oc++) {
                next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
                  bias[g * group_output_channels() + oc];
              }
            }
          }
        }
      }
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t oy = 0; oy < next_output_height(); oy++) {
          for (size_t ox = 0; ox < next_output_width(); ox++) {
            for (size_t ky = 0; ky < kernel_height(); ky++) {
              const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
              if (iy < next_input_height()) {
                for (size_t kx = 0; kx < kernel_width(); kx++) {
                  const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
                  if (ix < next_input_width()) {
                    for (size_t g = 0; g < groups(); g++) {
                      for (size_t oc = 0; oc < group_output_channels(); oc++) {
                        for (size_t ic = 0; ic < group_input_channels(); ic++) {
                          next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
                            input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic] *
                            kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
      for (float& value : next_output_ref) {
        value = std::max(std::min(value, output_max), output_min);
      }

      // Setup and run Convolution operator the second time, and destroy the operator.
      ASSERT_EQ(xnn_status_success,
        xnn_setup_convolution2d_nhwc_f32(
          convolution_op,
          next_batch_size(), next_input_height(), next_input_width(),
          input.data(), output.data(),
          nullptr /* thread pool */));

      ASSERT_EQ(xnn_status_success,
        xnn_run_operator(convolution_op, nullptr /* thread pool */));

      // Verify results of the second run.
      for (size_t i = 0; i < next_batch_size(); i++) {
        for (size_t y = 0; y < next_output_height(); y++) {
          for (size_t x = 0; x < next_output_width(); x++) {
            for (size_t g = 0; g < groups(); g++) {
              for (size_t c = 0; c < group_output_channels(); c++) {
                ASSERT_GE(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_min)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_LE(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_max)
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
                ASSERT_NEAR(
                  next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
                  output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c],
                  1.0e-4 * std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]))
                  << "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
              }
            }
          }
        }
      }
    }
  }

 private:
  uint32_t padding_top_{0};
  uint32_t padding_right_{0};
  uint32_t padding_bottom_{0};
  uint32_t padding_left_{0};
  size_t input_height_{1};
  size_t input_width_{1};
  uint32_t groups_{1};
  size_t group_input_channels_{1};
  size_t input_pixel_stride_{0};
  size_t group_output_channels_{1};
  size_t output_pixel_stride_{0};
  size_t batch_size_{1};
  uint32_t kernel_height_{1};
  uint32_t kernel_width_{1};
  uint32_t dilation_height_{1};
  uint32_t dilation_width_{1};
  uint32_t subsampling_height_{1};
  uint32_t subsampling_width_{1};
  size_t next_input_height_{0};
  size_t next_input_width_{0};
  size_t next_batch_size_{0};
  uint8_t qmin_{0};
  uint8_t qmax_{255};
  bool depthwise_layout_{false};
  size_t iterations_{1};
};
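
// A minimal usage sketch (hypothetical test case, assuming GoogleTest's TEST
// macro; the test name and parameter values are illustrative only):
//
//   TEST(CONVOLUTION_OP, grouped_3x3_with_padding) {
//     ConvolutionOperatorTester()
//       .input_size(13, 14)
//       .padding(1)
//       .kernel_size(3, 3)
//       .groups(2)
//       .group_input_channels(15)
//       .group_output_channels(17)
//       .iterations(3)
//       .TestF32();
//   }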