Blame - test/convolution-operator-tester.h - platform/external/XNNPACK

blob: 92c08c446f47f2661b5fd8a554a6aedee1737ea0 [file] [log] [blame]

XNNPACK Team	b455b12	2019-09-27 18:10:33 -0700	[diff] [blame^]	1	// Copyright (c) Facebook, Inc. and its affiliates.
				2	// All rights reserved.
				3	//
				4	// Copyright 2019 Google LLC
				5	//
				6	// This source code is licensed under the BSD-style license found in the
				7	// LICENSE file in the root directory of this source tree.
				8
				9	#pragma once
				10
				#include <gtest/gtest.h>

				#include <algorithm>
				#include <cassert>
				#include <cmath>
				#include <cstddef>
				#include <cstdint>
				#include <cstdlib>
				#include <functional>
				#include <limits>
				#include <memory>
				#include <random>
				#include <vector>

				#include <xnnpack.h>
				23
				24
				25	class ConvolutionOperatorTester {
				26	public:
				27	inline ConvolutionOperatorTester& padding(uint32_t padding) {
				28	this->padding_top_ = padding;
				29	this->padding_right_ = padding;
				30	this->padding_bottom_ = padding;
				31	this->padding_left_ = padding;
				32	return *this;
				33	}
				34
				35	inline ConvolutionOperatorTester& padding(uint32_t padding_height, uint32_t padding_width) {
				36	this->padding_top_ = padding_height;
				37	this->padding_right_ = padding_width;
				38	this->padding_bottom_ = padding_height;
				39	this->padding_left_ = padding_width;
				40	return *this;
				41	}
				42
				43	inline ConvolutionOperatorTester& padding_height(uint32_t padding_height) {
				44	this->padding_top_ = padding_height;
				45	this->padding_bottom_ = padding_height;
				46	return *this;
				47	}
				48
				49	inline ConvolutionOperatorTester& padding_width(uint32_t padding_width) {
				50	this->padding_right_ = padding_width;
				51	this->padding_left_ = padding_width;
				52	return *this;
				53	}
				54
				55	inline ConvolutionOperatorTester& padding_top(uint32_t padding_top) {
				56	this->padding_top_ = padding_top;
				57	return *this;
				58	}
				59
				60	inline uint32_t padding_top() const {
				61	return this->padding_top_;
				62	}
				63
				64	inline ConvolutionOperatorTester& padding_right(uint32_t padding_right) {
				65	this->padding_right_ = padding_right;
				66	return *this;
				67	}
				68
				69	inline uint32_t padding_right() const {
				70	return this->padding_right_;
				71	}
				72
				73	inline ConvolutionOperatorTester& padding_bottom(uint32_t padding_bottom) {
				74	this->padding_bottom_ = padding_bottom;
				75	return *this;
				76	}
				77
				78	inline uint32_t padding_bottom() const {
				79	return this->padding_bottom_;
				80	}
				81
				82	inline ConvolutionOperatorTester& padding_left(uint32_t padding_left) {
				83	this->padding_left_ = padding_left;
				84	return *this;
				85	}
				86
				87	inline uint32_t padding_left() const {
				88	return this->padding_left_;
				89	}
				90
				91	inline ConvolutionOperatorTester& input_size(uint32_t input_height, uint32_t input_width) {
				92	assert(input_height >= 1);
				93	assert(input_width >= 1);
				94	this->input_height_ = input_height;
				95	this->input_width_ = input_width;
				96	return *this;
				97	}
				98
				99	inline ConvolutionOperatorTester& input_height(uint32_t input_height) {
				100	assert(input_height >= 1);
				101	this->input_height_ = input_height;
				102	return *this;
				103	}
				104
				105	inline uint32_t input_height() const {
				106	return this->input_height_;
				107	}
				108
				109	inline ConvolutionOperatorTester& input_width(uint32_t input_width) {
				110	assert(input_width >= 1);
				111	this->input_width_ = input_width;
				112	return *this;
				113	}
				114
				115	inline uint32_t input_width() const {
				116	return this->input_width_;
				117	}
				118
				119	inline ConvolutionOperatorTester& groups(uint32_t groups) {
				120	assert(groups >= 1);
				121	this->groups_ = groups;
				122	return *this;
				123	}
				124
				125	inline uint32_t groups() const {
				126	return this->groups_;
				127	}
				128
				129	inline ConvolutionOperatorTester& group_input_channels(size_t group_input_channels) {
				130	assert(group_input_channels >= 1);
				131	this->group_input_channels_ = group_input_channels;
				132	return *this;
				133	}
				134
				135	inline size_t group_input_channels() const {
				136	return this->group_input_channels_;
				137	}
				138
				139	inline ConvolutionOperatorTester& group_output_channels(size_t group_output_channels) {
				140	assert(group_output_channels >= 1);
				141	this->group_output_channels_ = group_output_channels;
				142	return *this;
				143	}
				144
				145	inline size_t group_output_channels() const {
				146	return this->group_output_channels_;
				147	}
				148
				149	inline ConvolutionOperatorTester& batch_size(size_t batch_size) {
				150	assert(batch_size >= 1);
				151	this->batch_size_ = batch_size;
				152	return *this;
				153	}
				154
				155	inline size_t batch_size() const {
				156	return this->batch_size_;
				157	}
				158
				159	inline ConvolutionOperatorTester& kernel_size(uint32_t kernel_size) {
				160	assert(kernel_size >= 1);
				161	this->kernel_height_ = kernel_size;
				162	this->kernel_width_ = kernel_size;
				163	return *this;
				164	}
				165
				166	inline ConvolutionOperatorTester& kernel_size(uint32_t kernel_height, uint32_t kernel_width) {
				167	assert(kernel_height >= 1);
				168	assert(kernel_width >= 1);
				169	this->kernel_height_ = kernel_height;
				170	this->kernel_width_ = kernel_width;
				171	return *this;
				172	}
				173
				174	inline ConvolutionOperatorTester& kernel_height(uint32_t kernel_height) {
				175	assert(kernel_height >= 1);
				176	this->kernel_height_ = kernel_height;
				177	return *this;
				178	}
				179
				180	inline uint32_t kernel_height() const {
				181	return this->kernel_height_;
				182	}
				183
				184	inline ConvolutionOperatorTester& kernel_width(uint32_t kernel_width) {
				185	assert(kernel_width >= 1);
				186	this->kernel_width_ = kernel_width;
				187	return *this;
				188	}
				189
				190	inline uint32_t kernel_width() const {
				191	return this->kernel_width_;
				192	}
				193
				194	inline ConvolutionOperatorTester& dilation(uint32_t dilation) {
				195	assert(dilation >= 1);
				196	this->dilation_height_ = dilation;
				197	this->dilation_width_ = dilation;
				198	return *this;
				199	}
				200
				201	inline ConvolutionOperatorTester& dilation(uint32_t dilation_height, uint32_t dilation_width) {
				202	assert(dilation_height >= 1);
				203	assert(dilation_width >= 1);
				204	this->dilation_height_ = dilation_height;
				205	this->dilation_width_ = dilation_width;
				206	return *this;
				207	}
				208
				209	inline ConvolutionOperatorTester& dilation_height(uint32_t dilation_height) {
				210	assert(dilation_height >= 1);
				211	this->dilation_height_ = dilation_height;
				212	return *this;
				213	}
				214
				215	inline uint32_t dilation_height() const {
				216	return this->dilation_height_;
				217	}
				218
				219	inline ConvolutionOperatorTester& dilation_width(uint32_t dilation_width) {
				220	assert(dilation_width >= 1);
				221	this->dilation_width_ = dilation_width;
				222	return *this;
				223	}
				224
				225	inline uint32_t dilation_width() const {
				226	return this->dilation_width_;
				227	}
				228
				229	inline ConvolutionOperatorTester& subsampling(uint32_t subsampling) {
				230	assert(subsampling >= 1);
				231	this->subsampling_height_ = subsampling;
				232	this->subsampling_width_ = subsampling;
				233	return *this;
				234	}
				235
				236	inline ConvolutionOperatorTester& subsampling(uint32_t subsampling_height, uint32_t subsampling_width) {
				237	assert(subsampling_height >= 1);
				238	assert(subsampling_width >= 1);
				239	this->subsampling_height_ = subsampling_height;
				240	this->subsampling_width_ = subsampling_width;
				241	return *this;
				242	}
				243
				244	inline ConvolutionOperatorTester& subsampling_height(uint32_t subsampling_height) {
				245	assert(subsampling_height >= 1);
				246	this->subsampling_height_ = subsampling_height;
				247	return *this;
				248	}
				249
				250	inline uint32_t subsampling_height() const {
				251	return this->subsampling_height_;
				252	}
				253
				254	inline ConvolutionOperatorTester& subsampling_width(uint32_t subsampling_width) {
				255	assert(subsampling_width >= 1);
				256	this->subsampling_width_ = subsampling_width;
				257	return *this;
				258	}
				259
				260	inline uint32_t subsampling_width() const {
				261	return this->subsampling_width_;
				262	}
				263
				264	inline ConvolutionOperatorTester& input_pixel_stride(size_t input_pixel_stride) {
				265	assert(input_pixel_stride >= 1);
				266	this->input_pixel_stride_ = input_pixel_stride;
				267	return *this;
				268	}
				269
				270	inline size_t input_pixel_stride() const {
				271	if (this->input_pixel_stride_ == 0) {
				272	return group_input_channels() * groups();
				273	} else {
				274	assert(this->input_pixel_stride_ >= group_input_channels() * groups());
				275	return this->input_pixel_stride_;
				276	}
				277	}
				278
				279	inline ConvolutionOperatorTester& output_pixel_stride(size_t output_pixel_stride) {
				280	assert(output_pixel_stride >= 1);
				281	this->output_pixel_stride_ = output_pixel_stride;
				282	return *this;
				283	}
				284
				285	inline size_t output_pixel_stride() const {
				286	if (this->output_pixel_stride_ == 0) {
				287	return group_output_channels() * groups();
				288	} else {
				289	assert(this->output_pixel_stride_ >= group_output_channels() * groups());
				290	return this->output_pixel_stride_;
				291	}
				292	}
				293
				294	inline uint32_t dilated_kernel_height() const {
				295	return (kernel_height() - 1) * dilation_height() + 1;
				296	}
				297
				298	inline uint32_t dilated_kernel_width() const {
				299	return (kernel_width() - 1) * dilation_width() + 1;
				300	}
				301
				302	inline size_t output_height() const {
				303	const size_t padded_input_height = padding_top() + input_height() + padding_bottom();
				304	if (padded_input_height <= dilated_kernel_height()) {
				305	return 1;
				306	} else {
				307	return (padded_input_height - dilated_kernel_height()) / subsampling_height() + 1;
				308	}
				309	}
				310
				311	inline size_t output_width() const {
				312	const size_t padded_input_width = padding_left() + input_width() + padding_right();
				313	if (padded_input_width <= dilated_kernel_width()) {
				314	return 1;
				315	} else {
				316	return (padded_input_width - dilated_kernel_width()) / subsampling_width() + 1;
				317	}
				318	}
				319
				320	inline ConvolutionOperatorTester& next_input_size(uint32_t next_input_height, uint32_t next_input_width) {
				321	assert(next_input_height >= 1);
				322	assert(next_input_width >= 1);
				323	this->next_input_height_ = next_input_height;
				324	this->next_input_width_ = next_input_width;
				325	return *this;
				326	}
				327
				328	inline ConvolutionOperatorTester& next_input_height(uint32_t next_input_height) {
				329	assert(next_input_height >= 1);
				330	this->next_input_height_ = next_input_height;
				331	return *this;
				332	}
				333
				334	inline uint32_t next_input_height() const {
				335	if (this->next_input_height_ == 0) {
				336	return input_height();
				337	} else {
				338	return this->next_input_height_;
				339	}
				340	}
				341
				342	inline ConvolutionOperatorTester& next_input_width(uint32_t next_input_width) {
				343	assert(next_input_width >= 1);
				344	this->next_input_width_ = next_input_width;
				345	return *this;
				346	}
				347
				348	inline uint32_t next_input_width() const {
				349	if (this->next_input_width_ == 0) {
				350	return input_width();
				351	} else {
				352	return this->next_input_width_;
				353	}
				354	}
				355
				356	inline size_t next_output_height() const {
				357	const size_t padded_input_height = padding_top() + next_input_height() + padding_bottom();
				358	if (padded_input_height <= dilated_kernel_height()) {
				359	return 1;
				360	} else {
				361	return (padded_input_height - dilated_kernel_height()) / subsampling_height() + 1;
				362	}
				363	}
				364
				365	inline size_t next_output_width() const {
				366	const size_t padded_input_width = padding_left() + next_input_width() + padding_right();
				367	if (padded_input_width <= dilated_kernel_width()) {
				368	return 1;
				369	} else {
				370	return (padded_input_width - dilated_kernel_width()) / subsampling_width() + 1;
				371	}
				372	}
				373
				374	inline ConvolutionOperatorTester& next_batch_size(size_t next_batch_size) {
				375	assert(next_batch_size >= 1);
				376	this->next_batch_size_ = next_batch_size;
				377	return *this;
				378	}
				379
				380	inline size_t next_batch_size() const {
				381	if (this->next_batch_size_ == 0) {
				382	return batch_size();
				383	} else {
				384	return this->next_batch_size_;
				385	}
				386	}
				387
				388	inline ConvolutionOperatorTester& qmin(uint8_t qmin) {
				389	this->qmin_ = qmin;
				390	return *this;
				391	}
				392
				393	inline uint8_t qmin() const {
				394	return this->qmin_;
				395	}
				396
				397	inline ConvolutionOperatorTester& qmax(uint8_t qmax) {
				398	this->qmax_ = qmax;
				399	return *this;
				400	}
				401
				402	inline uint8_t qmax() const {
				403	return this->qmax_;
				404	}
				405
				406	inline ConvolutionOperatorTester& depthwise_layout(bool depthwise_layout) {
				407	this->depthwise_layout_ = depthwise_layout;
				408	return *this;
				409	}
				410
				411	inline bool depthwise_layout() const {
				412	return this->depthwise_layout_;
				413	}
				414
				415	inline ConvolutionOperatorTester& iterations(size_t iterations) {
				416	this->iterations_ = iterations;
				417	return *this;
				418	}
				419
				420	inline size_t iterations() const {
				421	return this->iterations_;
				422	}
				423
				424	void TestQ8() const {
				425	std::random_device random_device;
				426	auto rng = std::mt19937(random_device());
				427	auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
				428	auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
				429
				430	std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) +
				431	batch_size() * ((input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()) + 8);
				432	std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
				433	std::vector<int32_t> bias(groups() * group_output_channels());
				434	std::vector<uint8_t> output(batch_size() * ((output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()));
				435	std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
				436	std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
				437
				438	const uint8_t input_zero_point = 127;
				439	const uint8_t kernel_zero_point = 127;
				440
				441	for (size_t iteration = 0; iteration < iterations(); iteration++) {
				442	std::generate(input.begin(), input.end(), std::ref(u8rng));
				443	std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
				444	std::generate(bias.begin(), bias.end(), std::ref(s32rng));
				445	std::fill(output.begin(), output.end(), 0xA5);
				446
				447	// Compute reference results, without renormalization.
				448	for (size_t i = 0; i < batch_size(); i++) {
				449	for (size_t oy = 0; oy < output_height(); oy++) {
				450	for (size_t ox = 0; ox < output_width(); ox++) {
				451	for (size_t g = 0; g < groups(); g++) {
				452	for (size_t oc = 0; oc < group_output_channels(); oc++) {
				453	accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
				454	bias[g * group_output_channels() + oc];
				455	}
				456	}
				457	}
				458	}
				459	}
				460	if (depthwise_layout()) {
				461	ASSERT_EQ(group_input_channels(), 1);
				462
				463	for (size_t i = 0; i < batch_size(); i++) {
				464	for (size_t oy = 0; oy < output_height(); oy++) {
				465	for (size_t ox = 0; ox < output_width(); ox++) {
				466	for (size_t ky = 0; ky < kernel_height(); ky++) {
				467	const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
				468	if (iy < input_height()) {
				469	for (size_t kx = 0; kx < kernel_width(); kx++) {
				470	const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
				471	if (ix < input_width()) {
				472	for (size_t g = 0; g < groups(); g++) {
				473	for (size_t oc = 0; oc < group_output_channels(); oc++) {
				474	accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
				475	(int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g]) - int32_t(input_zero_point)) *
				476	(int32_t(kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc]) - int32_t(kernel_zero_point));
				477	}
				478	}
				479	}
				480	}
				481	}
				482	}
				483	}
				484	}
				485	}
				486	} else {
				487	for (size_t i = 0; i < batch_size(); i++) {
				488	for (size_t oy = 0; oy < output_height(); oy++) {
				489	for (size_t ox = 0; ox < output_width(); ox++) {
				490	for (size_t ky = 0; ky < kernel_height(); ky++) {
				491	const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
				492	if (iy < input_height()) {
				493	for (size_t kx = 0; kx < kernel_width(); kx++) {
				494	const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
				495	if (ix < input_width()) {
				496	for (size_t g = 0; g < groups(); g++) {
				497	for (size_t oc = 0; oc < group_output_channels(); oc++) {
				498	for (size_t ic = 0; ic < group_input_channels(); ic++) {
				499	accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
				500	(int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
				501	(int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
				502	}
				503	}
				504	}
				505	}
				506	}
				507	}
				508	}
				509	}
				510	}
				511	}
				512	}
				513
				514	// Compute renormalization parameters.
				515	const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
				516	const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
				517
				518	const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
				519	const uint8_t output_zero_point = uint8_t(std::max(std::min(
				520	lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
				521	long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));
				522
				523	// Renormalize reference results.
				524	std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
				525	[this, output_scale, output_zero_point](int32_t x) -> double {
				526	return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
				527	});
				528
				529	// Create, setup, run, and destroy Convolution operator.
				530	ASSERT_EQ(xnn_status_success, xnn_initialize());
				531	xnn_operator_t convolution_op = nullptr;
				532
				533	ASSERT_EQ(xnn_status_success,
				534	xnn_create_convolution2d_nhwc_q8(
				535	padding_top(), padding_right(), padding_bottom(), padding_left(),
				536	kernel_height(), kernel_width(),
				537	subsampling_height(), subsampling_width(),
				538	dilation_height(), dilation_width(),
				539	groups(), group_input_channels(), group_output_channels(),
				540	input_pixel_stride(), output_pixel_stride(),
				541	input_zero_point, 1.0f /* input scale */,
				542	kernel_zero_point, 1.0f /* kernel scale */,
				543	kernel.data(), bias.data(),
				544	output_zero_point, output_scale, qmin(), qmax(),
				545	depthwise_layout() ? XNN_CONVOLUTION_FLAG_DEPTHWISE : 0,
				546	&convolution_op));
				547
				548	// Smart pointer to automatically delete convolution_op.
				549	std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
				550
				551	ASSERT_EQ(xnn_status_success,
				552	xnn_setup_convolution2d_nhwc_q8(
				553	convolution_op,
				554	batch_size(), input_height(), input_width(),
				555	input.data(), output.data(),
				556	nullptr /* thread pool */));
				557
				558	ASSERT_EQ(xnn_status_success,
				559	xnn_run_operator(convolution_op, nullptr /* thread pool */));
				560
				561	// Verify results.
				562	for (size_t i = 0; i < batch_size(); i++) {
				563	for (size_t y = 0; y < output_height(); y++) {
				564	for (size_t x = 0; x < output_width(); x++) {
				565	for (size_t g = 0; g < groups(); g++) {
				566	for (size_t c = 0; c < group_output_channels(); c++) {
				567	ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
				568	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				569	ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
				570	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				571	ASSERT_NEAR(
				572	output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
				573	double(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
				574	0.9)
				575	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				576	}
				577	}
				578	}
				579	}
				580	}
				581	}
				582	}
				583
				584	void TestF32() const {
				585	std::random_device random_device;
				586	auto rng = std::mt19937(random_device());
				587	auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);
				588
				589	std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
				590	batch_size() * ((input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()));
				591	std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
				592	std::vector<float> bias(groups() * group_output_channels());
				593	std::vector<float> output(batch_size() * ((output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()));
				594	std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
				595
				596	for (size_t iteration = 0; iteration < iterations(); iteration++) {
				597	std::generate(input.begin(), input.end(), std::ref(f32rng));
				598	std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
				599	std::generate(bias.begin(), bias.end(), std::ref(f32rng));
				600	std::fill(output.begin(), output.end(), nanf(""));
				601
				602	// Compute reference results, without clamping.
				603	for (size_t i = 0; i < batch_size(); i++) {
				604	for (size_t oy = 0; oy < output_height(); oy++) {
				605	for (size_t ox = 0; ox < output_width(); ox++) {
				606	for (size_t g = 0; g < groups(); g++) {
				607	for (size_t oc = 0; oc < group_output_channels(); oc++) {
				608	output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
				609	bias[g * group_output_channels() + oc];
				610	}
				611	}
				612	}
				613	}
				614	}
				615	if (depthwise_layout()) {
				616	ASSERT_EQ(group_input_channels(), 1);
				617
				618	for (size_t i = 0; i < batch_size(); i++) {
				619	for (size_t oy = 0; oy < output_height(); oy++) {
				620	for (size_t ox = 0; ox < output_width(); ox++) {
				621	for (size_t ky = 0; ky < kernel_height(); ky++) {
				622	const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
				623	if (iy < input_height()) {
				624	for (size_t kx = 0; kx < kernel_width(); kx++) {
				625	const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
				626	if (ix < input_width()) {
				627	for (size_t g = 0; g < groups(); g++) {
				628	for (size_t oc = 0; oc < group_output_channels(); oc++) {
				629	output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
				630	input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g] *
				631	kernel[((ky * kernel_width() + kx) * groups() + g) * group_output_channels() + oc];
				632	}
				633	}
				634	}
				635	}
				636	}
				637	}
				638	}
				639	}
				640	}
				641	} else {
				642	for (size_t i = 0; i < batch_size(); i++) {
				643	for (size_t oy = 0; oy < output_height(); oy++) {
				644	for (size_t ox = 0; ox < output_width(); ox++) {
				645	for (size_t ky = 0; ky < kernel_height(); ky++) {
				646	const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
				647	if (iy < input_height()) {
				648	for (size_t kx = 0; kx < kernel_width(); kx++) {
				649	const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
				650	if (ix < input_width()) {
				651	for (size_t g = 0; g < groups(); g++) {
				652	for (size_t oc = 0; oc < group_output_channels(); oc++) {
				653	for (size_t ic = 0; ic < group_input_channels(); ic++) {
				654	output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
				655	input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic] *
				656	kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
				657	}
				658	}
				659	}
				660	}
				661	}
				662	}
				663	}
				664	}
				665	}
				666	}
				667	}
				668
				669	// Compute clamping parameters.
				670	const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
				671	const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
				672
				673	const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
				674	const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
				675
				676	// Clamp reference results.
				677	for (float& value : output_ref) {
				678	value = std::max(std::min(value, output_max), output_min);
				679	}
				680
				681	// Create, setup, run, and destroy Convolution operator.
				682	ASSERT_EQ(xnn_status_success, xnn_initialize());
				683	xnn_operator_t convolution_op = nullptr;
				684
				685	ASSERT_EQ(xnn_status_success,
				686	xnn_create_convolution2d_nhwc_f32(
				687	padding_top(), padding_right(), padding_bottom(), padding_left(),
				688	kernel_height(), kernel_width(),
				689	subsampling_height(), subsampling_width(),
				690	dilation_height(), dilation_width(),
				691	groups(), group_input_channels(), group_output_channels(),
				692	input_pixel_stride(), output_pixel_stride(),
				693	kernel.data(), bias.data(),
				694	output_min, output_max,
				695	depthwise_layout() ? XNN_CONVOLUTION_FLAG_DEPTHWISE : 0,
				696	&convolution_op));
				697
				698	// Smart pointer to automatically delete convolution_op.
				699	std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
				700
				701	ASSERT_EQ(xnn_status_success,
				702	xnn_setup_convolution2d_nhwc_f32(
				703	convolution_op,
				704	batch_size(), input_height(), input_width(),
				705	input.data(), output.data(),
				706	nullptr /* thread pool */));
				707
				708	ASSERT_EQ(xnn_status_success,
				709	xnn_run_operator(convolution_op, nullptr /* thread pool */));
				710
				711	// Verify results.
				712	for (size_t i = 0; i < batch_size(); i++) {
				713	for (size_t y = 0; y < output_height(); y++) {
				714	for (size_t x = 0; x < output_width(); x++) {
				715	for (size_t g = 0; g < groups(); g++) {
				716	for (size_t c = 0; c < group_output_channels(); c++) {
				717	ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_min)
				718	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				719	ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_max)
				720	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				721	ASSERT_NEAR(
				722	output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
				723	output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c],
				724	1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
				725	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				726	}
				727	}
				728	}
				729	}
				730	}
				731	}
				732	}
				733
				734	void TestSetupQ8() const {
				735	ASSERT_FALSE(depthwise_layout());
				736
				737	std::random_device random_device;
				738	auto rng = std::mt19937(random_device());
				739	auto s32rng = std::bind(std::uniform_int_distribution<int32_t>(-10000, 10000), rng);
				740	auto u8rng = std::bind(std::uniform_int_distribution<uint8_t>(), rng);
				741
				742	std::vector<uint8_t> input(XNN_EXTRA_BYTES / sizeof(uint8_t) + std::max(
				743	batch_size() * ((input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()),
				744	next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_pixel_stride() + groups() * group_input_channels())) + 8);
				745	std::vector<uint8_t> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
				746	std::vector<int32_t> bias(groups() * group_output_channels());
				747	std::vector<uint8_t> output(std::max(
				748	batch_size() * ((output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()),
				749	next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_pixel_stride() + groups() * group_output_channels())));
				750	std::vector<int32_t> accumulators(batch_size() * output_height() * output_width() * groups() * group_output_channels());
				751	std::vector<double> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
				752	std::vector<int32_t> next_accumulators(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
				753	std::vector<double> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
				754
				755	const uint8_t input_zero_point = 127;
				756	const uint8_t kernel_zero_point = 127;
				757
				758	for (size_t iteration = 0; iteration < iterations(); iteration++) {
				759	std::generate(input.begin(), input.end(), std::ref(u8rng));
				760	std::generate(kernel.begin(), kernel.end(), std::ref(u8rng));
				761	std::generate(bias.begin(), bias.end(), std::ref(s32rng));
				762	std::fill(output.begin(), output.end(), 0xA5);
				763
				764	// Compute reference results, without renormalization.
				765	for (size_t i = 0; i < batch_size(); i++) {
				766	for (size_t oy = 0; oy < output_height(); oy++) {
				767	for (size_t ox = 0; ox < output_width(); ox++) {
				768	for (size_t g = 0; g < groups(); g++) {
				769	for (size_t oc = 0; oc < group_output_channels(); oc++) {
				770	accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
				771	bias[g * group_output_channels() + oc];
				772	}
				773	}
				774	}
				775	}
				776	}
				777	for (size_t i = 0; i < batch_size(); i++) {
				778	for (size_t oy = 0; oy < output_height(); oy++) {
				779	for (size_t ox = 0; ox < output_width(); ox++) {
				780	for (size_t ky = 0; ky < kernel_height(); ky++) {
				781	const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
				782	if (iy < input_height()) {
				783	for (size_t kx = 0; kx < kernel_width(); kx++) {
				784	const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
				785	if (ix < input_width()) {
				786	for (size_t g = 0; g < groups(); g++) {
				787	for (size_t oc = 0; oc < group_output_channels(); oc++) {
				788	for (size_t ic = 0; ic < group_input_channels(); ic++) {
				789	accumulators[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
				790	(int32_t(input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
				791	(int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
				792	}
				793	}
				794	}
				795	}
				796	}
				797	}
				798	}
				799	}
				800	}
				801	}
				802
				803	// Compute renormalization parameters.
				804	const int32_t accumulated_min = *std::min_element(accumulators.cbegin(), accumulators.cend());
				805	const int32_t accumulated_max = *std::max_element(accumulators.cbegin(), accumulators.cend());
				806
				807	const double output_scale = double(uint32_t(accumulated_max - accumulated_min)) / 255.0;
				808	const uint8_t output_zero_point = uint8_t(std::max(std::min(
				809	lrint(127.5 - 0.5 * double(accumulated_min + accumulated_max) / output_scale),
				810	long(std::numeric_limits<uint8_t>::max())), long(std::numeric_limits<uint8_t>::min())));
				811
				812	// Renormalize reference results.
				813	std::transform(accumulators.cbegin(), accumulators.cend(), output_ref.begin(),
				814	[this, output_scale, output_zero_point](int32_t x) -> double {
				815	return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
				816	});
				817
				818	// Create, setup, and run Convolution operator once.
				819	ASSERT_EQ(xnn_status_success, xnn_initialize());
				820	xnn_operator_t convolution_op = nullptr;
				821
				822	ASSERT_EQ(xnn_status_success,
				823	xnn_create_convolution2d_nhwc_q8(
				824	padding_top(), padding_right(), padding_bottom(), padding_left(),
				825	kernel_height(), kernel_width(),
				826	subsampling_height(), subsampling_width(),
				827	dilation_height(), dilation_width(),
				828	groups(), group_input_channels(), group_output_channels(),
				829	input_pixel_stride(), output_pixel_stride(),
				830	input_zero_point, 1.0f /* input scale */,
				831	kernel_zero_point, 1.0f /* kernel scale */,
				832	kernel.data(), bias.data(),
				833	output_zero_point, output_scale, qmin(), qmax(),
				834	0, &convolution_op));
				835
				836	// Smart pointer to automatically delete convolution_op.
				837	std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
				838
				839	ASSERT_EQ(xnn_status_success,
				840	xnn_setup_convolution2d_nhwc_q8(
				841	convolution_op,
				842	batch_size(), input_height(), input_width(),
				843	input.data(), output.data(),
				844	nullptr /* thread pool */));
				845
				846	ASSERT_EQ(xnn_status_success,
				847	xnn_run_operator(convolution_op, nullptr /* thread pool */));
				848
				849	// Verify results of the first run.
				850	for (size_t i = 0; i < batch_size(); i++) {
				851	for (size_t y = 0; y < output_height(); y++) {
				852	for (size_t x = 0; x < output_width(); x++) {
				853	for (size_t g = 0; g < groups(); g++) {
				854	for (size_t c = 0; c < group_output_channels(); c++) {
				855	ASSERT_LE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
				856	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				857	ASSERT_GE(int32_t(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
				858	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				859	ASSERT_NEAR(
				860	output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
				861	double(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
				862	0.9)
				863	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				864	}
				865	}
				866	}
				867	}
				868	}
				869
				870	// Re-generate data for the second run.
				871	std::generate(input.begin(), input.end(), std::ref(u8rng));
				872	std::fill(output.begin(), output.end(), 0xA5);
				873
				874	// Compute reference results for the second run, including renormalization.
				875	for (size_t i = 0; i < next_batch_size(); i++) {
				876	for (size_t oy = 0; oy < next_output_height(); oy++) {
				877	for (size_t ox = 0; ox < next_output_width(); ox++) {
				878	for (size_t g = 0; g < groups(); g++) {
				879	for (size_t oc = 0; oc < group_output_channels(); oc++) {
				880	next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
				881	bias[g * group_output_channels() + oc];
				882	}
				883	}
				884	}
				885	}
				886	}
				887	for (size_t i = 0; i < next_batch_size(); i++) {
				888	for (size_t oy = 0; oy < next_output_height(); oy++) {
				889	for (size_t ox = 0; ox < next_output_width(); ox++) {
				890	for (size_t ky = 0; ky < kernel_height(); ky++) {
				891	const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
				892	if (iy < next_input_height()) {
				893	for (size_t kx = 0; kx < kernel_width(); kx++) {
				894	const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
				895	if (ix < next_input_width()) {
				896	for (size_t g = 0; g < groups(); g++) {
				897	for (size_t oc = 0; oc < group_output_channels(); oc++) {
				898	for (size_t ic = 0; ic < group_input_channels(); ic++) {
				899	next_accumulators[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
				900	(int32_t(input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic]) - int32_t(input_zero_point)) *
				901	(int32_t(kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic]) - int32_t(kernel_zero_point));
				902	}
				903	}
				904	}
				905	}
				906	}
				907	}
				908	}
				909	}
				910	}
				911	}
				912	std::transform(next_accumulators.cbegin(), next_accumulators.cend(), next_output_ref.begin(),
				913	[this, output_scale, output_zero_point](int32_t x) -> double {
				914	return std::max<double>(std::min<double>(double(x) / output_scale, double(qmax()) - output_zero_point), double(qmin()) - output_zero_point);
				915	});
				916
				917	// Setup and run Convolution operator the second time, and destroy the operator.
				918	ASSERT_EQ(xnn_status_success,
				919	xnn_setup_convolution2d_nhwc_q8(
				920	convolution_op,
				921	next_batch_size(), next_input_height(), next_input_width(),
				922	input.data(), output.data(),
				923	nullptr /* thread pool */));
				924
				925	ASSERT_EQ(xnn_status_success,
				926	xnn_run_operator(convolution_op, nullptr /* thread pool */));
				927
				928	// Verify results of the second run.
				929	for (size_t i = 0; i < next_batch_size(); i++) {
				930	for (size_t y = 0; y < next_output_height(); y++) {
				931	for (size_t x = 0; x < next_output_width(); x++) {
				932	for (size_t g = 0; g < groups(); g++) {
				933	for (size_t c = 0; c < group_output_channels(); c++) {
				934	ASSERT_LE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmax()))
				935	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				936	ASSERT_GE(int32_t(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]), int32_t(qmin()))
				937	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				938	ASSERT_NEAR(
				939	next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
				940	double(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c]) - double(output_zero_point),
				941	0.9)
				942	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				943	}
				944	}
				945	}
				946	}
				947	}
				948	}
				949	}
				950
				951	void TestSetupF32() const {
					// Checks that a single F32 NHWC convolution operator can be set up more than once:
					// the operator is created once, set up and run for the
					// (batch_size, input_height, input_width) shape, then set up and run again for the
					// (next_batch_size, next_input_height, next_input_width) shape. After each run the
					// operator output is compared against a scalar reference convolution computed here.
					// The reference indexes the kernel as [group][output channel][ky][kx][input channel],
					// so the depthwise kernel layout is rejected up front.
				952	ASSERT_FALSE(depthwise_layout());
				953
				954	std::random_device random_device;
				955	auto rng = std::mt19937(random_device());
					// Inputs, weights and biases are all drawn from the range [0.1, 1.0).
				956	auto f32rng = std::bind(std::uniform_real_distribution<float>(0.1f, 1.0f), rng);
				957
					// Input and output buffers are shared by both runs, so each is sized for the larger
					// of the two shapes. Per image the size is (pixels - 1) * pixel_stride + channels:
					// the last pixel only needs its channels, not a full stride. The input additionally
					// gets XNN_EXTRA_BYTES worth of trailing elements.
				958	std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) + std::max(
				959	batch_size() * ((input_height() * input_width() - 1) * input_pixel_stride() + groups() * group_input_channels()),
				960	next_batch_size() * ((next_input_height() * next_input_width() - 1) * input_pixel_stride() + groups() * group_input_channels())));
				961	std::vector<float> kernel(groups() * group_output_channels() * kernel_height() * kernel_width() * group_input_channels());
				962	std::vector<float> bias(groups() * group_output_channels());
				963	std::vector<float> output(std::max(
				964	batch_size() * ((output_height() * output_width() - 1) * output_pixel_stride() + groups() * group_output_channels()),
				965	next_batch_size() * ((next_output_height() * next_output_width() - 1) * output_pixel_stride() + groups() * group_output_channels())));
					// Reference outputs are stored densely (no pixel stride), one buffer per run.
				966	std::vector<float> output_ref(batch_size() * output_height() * output_width() * groups() * group_output_channels());
				967	std::vector<float> next_output_ref(next_batch_size() * next_output_height() * next_output_width() * groups() * group_output_channels());
				968
				969	for (size_t iteration = 0; iteration < iterations(); iteration++) {
				970	std::generate(input.begin(), input.end(), std::ref(f32rng));
				971	std::generate(kernel.begin(), kernel.end(), std::ref(f32rng));
				972	std::generate(bias.begin(), bias.end(), std::ref(f32rng));
					// Pre-fill the output with NaN: any element the operator leaves unwritten then
					// fails the range and closeness assertions below.
				973	std::fill(output.begin(), output.end(), nanf(""));
				974
				975	// Compute reference results, without clamping.
					// First pass: initialize every reference output with its channel's bias.
				976	for (size_t i = 0; i < batch_size(); i++) {
				977	for (size_t oy = 0; oy < output_height(); oy++) {
				978	for (size_t ox = 0; ox < output_width(); ox++) {
				979	for (size_t g = 0; g < groups(); g++) {
				980	for (size_t oc = 0; oc < group_output_channels(); oc++) {
				981	output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] =
				982	bias[g * group_output_channels() + oc];
				983	}
				984	}
				985	}
				986	}
				987	}
					// Second pass: accumulate input * kernel products over the kernel window.
				988	for (size_t i = 0; i < batch_size(); i++) {
				989	for (size_t oy = 0; oy < output_height(); oy++) {
				990	for (size_t ox = 0; ox < output_width(); ox++) {
				991	for (size_t ky = 0; ky < kernel_height(); ky++) {
					// iy/ix are computed in unsigned arithmetic, so a position that falls in the
					// top/left padding wraps around to a huge value; the single `< height/width`
					// comparison therefore rejects taps on either side of the image.
				992	const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
				993	if (iy < input_height()) {
				994	for (size_t kx = 0; kx < kernel_width(); kx++) {
				995	const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
				996	if (ix < input_width()) {
				997	for (size_t g = 0; g < groups(); g++) {
				998	for (size_t oc = 0; oc < group_output_channels(); oc++) {
				999	for (size_t ic = 0; ic < group_input_channels(); ic++) {
				1000	output_ref[(((i * output_height() + oy) * output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
				1001	input[((i * input_height() + iy) * input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic] *
				1002	kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
				1003	}
				1004	}
				1005	}
				1006	}
				1007	}
				1008	}
				1009	}
				1010	}
				1011	}
				1012	}
				1013
				1014	// Compute clamping parameters.
					// The clamp bounds are derived from the observed range of the first run's reference
					// results: qmin() and qmax() select a position within that range on a 0..255 scale.
				1015	const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
				1016	const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
				1017
				1018	const float output_min = accumulated_min + (accumulated_max - accumulated_min) / 255.0f * float(qmin());
				1019	const float output_max = accumulated_max - (accumulated_max - accumulated_min) / 255.0f * float(255 - qmax());
				1020
				1021	// Clamp reference results.
				1022	for (float& value : output_ref) {
				1023	value = std::max(std::min(value, output_max), output_min);
				1024	}
				1025
				1026	// Create, setup, and run Convolution operator once.
				1027	ASSERT_EQ(xnn_status_success, xnn_initialize());
				1028	xnn_operator_t convolution_op = nullptr;
				1029
					// The kernel, bias and clamp bounds are supplied here, at creation time; the
					// later setup calls only provide shapes and data pointers.
				1030	ASSERT_EQ(xnn_status_success,
				1031	xnn_create_convolution2d_nhwc_f32(
				1032	padding_top(), padding_right(), padding_bottom(), padding_left(),
				1033	kernel_height(), kernel_width(),
				1034	subsampling_height(), subsampling_width(),
				1035	dilation_height(), dilation_width(),
				1036	groups(), group_input_channels(), group_output_channels(),
				1037	input_pixel_stride(), output_pixel_stride(),
				1038	kernel.data(), bias.data(),
				1039	output_min, output_max,
				1040	0, &convolution_op));
				1041
				1042	// Smart pointer to automatically delete convolution_op.
					// This also covers the early return taken when one of the ASSERT_* checks below fails.
				1043	std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convolution_op(convolution_op, xnn_delete_operator);
				1044
				1045	ASSERT_EQ(xnn_status_success,
				1046	xnn_setup_convolution2d_nhwc_f32(
				1047	convolution_op,
				1048	batch_size(), input_height(), input_width(),
				1049	input.data(), output.data(),
				1050	nullptr /* thread pool */));
				1051
				1052	ASSERT_EQ(xnn_status_success,
				1053	xnn_run_operator(convolution_op, nullptr /* thread pool */));
				1054
				1055	// Verify results of the first run.
					// The operator output is read with output_pixel_stride(), the reference densely.
					// Each value must lie within the clamp bounds and match the reference to a
					// relative tolerance of 1e-4.
				1056	for (size_t i = 0; i < batch_size(); i++) {
				1057	for (size_t y = 0; y < output_height(); y++) {
				1058	for (size_t x = 0; x < output_width(); x++) {
				1059	for (size_t g = 0; g < groups(); g++) {
				1060	for (size_t c = 0; c < group_output_channels(); c++) {
				1061	ASSERT_GE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_min)
				1062	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				1063	ASSERT_LE(output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_max)
				1064	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				1065	ASSERT_NEAR(
				1066	output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c],
				1067	output[((i * output_height() + y) * output_width() + x) * output_pixel_stride() + g * group_output_channels() + c],
				1068	1.0e-4 * std::abs(output_ref[(((i * output_height() + y) * output_width() + x) * groups() + g) * group_output_channels() + c]))
				1069	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				1070	}
				1071	}
				1072	}
				1073	}
				1074	}
				1075
				1076	// Re-generate data for the second run.
					// Only the input changes; the kernel, bias and clamp bounds stay as created.
				1077	std::generate(input.begin(), input.end(), std::ref(f32rng));
				1078	std::fill(output.begin(), output.end(), nanf(""));
				1079
				1080	// Compute reference results for the second run, including clamping.
					// Same two-pass reference as above, using the next_* shape throughout.
				1081	for (size_t i = 0; i < next_batch_size(); i++) {
				1082	for (size_t oy = 0; oy < next_output_height(); oy++) {
				1083	for (size_t ox = 0; ox < next_output_width(); ox++) {
				1084	for (size_t g = 0; g < groups(); g++) {
				1085	for (size_t oc = 0; oc < group_output_channels(); oc++) {
				1086	next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] =
				1087	bias[g * group_output_channels() + oc];
				1088	}
				1089	}
				1090	}
				1091	}
				1092	}
				1093	for (size_t i = 0; i < next_batch_size(); i++) {
				1094	for (size_t oy = 0; oy < next_output_height(); oy++) {
				1095	for (size_t ox = 0; ox < next_output_width(); ox++) {
				1096	for (size_t ky = 0; ky < kernel_height(); ky++) {
				1097	const size_t iy = oy * subsampling_height() + ky * dilation_height() - padding_top();
				1098	if (iy < next_input_height()) {
				1099	for (size_t kx = 0; kx < kernel_width(); kx++) {
				1100	const size_t ix = ox * subsampling_width() + kx * dilation_width() - padding_left();
				1101	if (ix < next_input_width()) {
				1102	for (size_t g = 0; g < groups(); g++) {
				1103	for (size_t oc = 0; oc < group_output_channels(); oc++) {
				1104	for (size_t ic = 0; ic < group_input_channels(); ic++) {
				1105	next_output_ref[(((i * next_output_height() + oy) * next_output_width() + ox) * groups() + g) * group_output_channels() + oc] +=
				1106	input[((i * next_input_height() + iy) * next_input_width() + ix) * input_pixel_stride() + g * group_input_channels() + ic] *
				1107	kernel[(((g * group_output_channels() + oc) * kernel_height() + ky) * kernel_width() + kx) * group_input_channels() + ic];
				1108	}
				1109	}
				1110	}
				1111	}
				1112	}
				1113	}
				1114	}
				1115	}
				1116	}
				1117	}
					// The second run reuses the clamp bounds computed from the first run's results.
				1118	for (float& value : next_output_ref) {
				1119	value = std::max(std::min(value, output_max), output_min);
				1120	}
				1121
				1122	// Setup and run Convolution operator the second time, and destroy the operator.
				1123	ASSERT_EQ(xnn_status_success,
				1124	xnn_setup_convolution2d_nhwc_f32(
				1125	convolution_op,
				1126	next_batch_size(), next_input_height(), next_input_width(),
				1127	input.data(), output.data(),
				1128	nullptr /* thread pool */));
				1129
				1130	ASSERT_EQ(xnn_status_success,
				1131	xnn_run_operator(convolution_op, nullptr /* thread pool */));
				1132
				1133	// Verify results of the second run.
				1134	for (size_t i = 0; i < next_batch_size(); i++) {
				1135	for (size_t y = 0; y < next_output_height(); y++) {
				1136	for (size_t x = 0; x < next_output_width(); x++) {
				1137	for (size_t g = 0; g < groups(); g++) {
				1138	for (size_t c = 0; c < group_output_channels(); c++) {
				1139	ASSERT_GE(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_min)
				1140	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				1141	ASSERT_LE(output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c], output_max)
				1142	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				1143	ASSERT_NEAR(
				1144	next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c],
				1145	output[((i * next_output_height() + y) * next_output_width() + x) * output_pixel_stride() + g * group_output_channels() + c],
				1146	1.0e-4 * std::abs(next_output_ref[(((i * next_output_height() + y) * next_output_width() + x) * groups() + g) * group_output_channels() + c]))
				1147	<< "(x, y) = (" << x << ", " << y << "), group = " << g << ", channel = " << c;
				1148	}
				1149	}
				1150	}
				1151	}
				1152	}
				1153	}
				1154	}
				1155
				1156	private:
					// Tester configuration, each field with an in-class default.
					// NOTE(review): apart from the padding setters at the top of the class, the
					// accessors that read and write these fields are not visible from this part of
					// the file; notes on what a zero default means are assumptions to confirm there.

					// Padding per edge, written by the padding*() setters; defaults to no padding.
				1157	uint32_t padding_top_{0};
				1158	uint32_t padding_right_{0};
				1159	uint32_t padding_bottom_{0};
				1160	uint32_t padding_left_{0};
					// Input shape and channel configuration; everything defaults to 1.
				1161	size_t input_height_{1};
				1162	size_t input_width_{1};
				1163	uint32_t groups_{1};
				1164	size_t group_input_channels_{1};
					// NOTE(review): 0 presumably means "no explicit stride, derive it from the
					// channel counts" -- confirm in input_pixel_stride().
				1165	size_t input_pixel_stride_{0};
				1166	size_t group_output_channels_{1};
					// NOTE(review): same presumed convention as input_pixel_stride_ -- confirm.
				1167	size_t output_pixel_stride_{0};
				1168	size_t batch_size_{1};
					// Kernel geometry: a 1x1 kernel with unit dilation and unit subsampling by default.
				1169	uint32_t kernel_height_{1};
				1170	uint32_t kernel_width_{1};
				1171	uint32_t dilation_height_{1};
				1172	uint32_t dilation_width_{1};
				1173	uint32_t subsampling_height_{1};
				1174	uint32_t subsampling_width_{1};
					// Shape for the second run in the TestSetup* methods.
					// NOTE(review): 0 presumably means "reuse the first run's value" -- confirm in
					// the next_*() accessors.
				1175	size_t next_input_height_{0};
				1176	size_t next_input_width_{0};
				1177	size_t next_batch_size_{0};
					// Output clamping selectors; the defaults span the full uint8_t range (0..255).
				1178	uint8_t qmin_{0};
				1179	uint8_t qmax_{255};
				1180	bool depthwise_layout_{false};
					// NOTE(review): presumably the value returned by iterations(), which bounds the
					// outer loop of each Test* method -- confirm.
				1181	size_t iterations_{1};
				1182	};