// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/operator.h>
#include <xnnpack/requantization.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/pack.h>
#include <xnnpack/params.h>
#include <xnnpack/indirection.h>

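// Size of one deconvolution output dimension, computed from the corresponding
// input dimension as
//   output = stride * (input - 1) + adjustment + dilated_kernel - output_padding
// where dilated_kernel = (kernel - 1) * dilation + 1. doz() saturates the
// subtraction at zero, so over-large output padding cannot underflow the result.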
static inline size_t compute_output_dimension(
    size_t input_dimension,
    size_t output_padding_dimension,
    size_t adjustment_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t stride_dimension)
{
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  return doz(
    stride_dimension * (input_dimension - 1) + adjustment_dimension + effective_kernel_dimension,
    output_padding_dimension);
}

enum xnn_status xnn_create_deconvolution2d_nhwc_q8(
    uint32_t output_padding_top,
    uint32_t output_padding_right,
    uint32_t output_padding_bottom,
    uint32_t output_padding_left,
    uint32_t adjustment_height,
    uint32_t adjustment_width,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    uint8_t input_zero_point,
    float input_scale,
    uint8_t kernel_zero_point,
    float kernel_scale,
    const uint8_t* kernel,
    const int32_t* bias,
    uint8_t output_zero_point,
    float output_scale,
    uint8_t output_min,
    uint8_t output_max,
    uint32_t flags,
    xnn_operator_t* deconvolution_op_out)
{
  xnn_operator_t deconvolution_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to create Deconvolution operator: XNNPACK is not initialized");
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      kernel_width, kernel_height);
    goto error;
  }

  if (stride_width == 0 || stride_height == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero",
      stride_width, stride_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " dilation: "
      "dilation dimensions must be non-zero",
      dilation_width, dilation_height);
    goto error;
  }

  if (groups == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 " groups: number of groups must be non-zero", groups);
    goto error;
  }

  if (group_input_channels == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %zu input channels per group: "
      "number of channels must be non-zero",
      group_input_channels);
    goto error;
  }

  if (group_output_channels == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %zu output channels per group: "
      "number of channels must be non-zero",
      group_output_channels);
    goto error;
  }

  const size_t input_channels = groups * group_input_channels;
  if (input_pixel_stride < input_channels) {
    xnn_log_error(
      "failed to create Deconvolution operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
      input_pixel_stride, groups, group_input_channels);
    goto error;
  }

  const size_t output_channels = groups * group_output_channels;
  if (output_pixel_stride < output_channels) {
    xnn_log_error(
      "failed to create Deconvolution operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
      output_pixel_stride, groups, group_output_channels);
    goto error;
  }

  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create Deconvolution operator with %.7g input scale: scale must be finite, normalized, and positive",
      input_scale);
    goto error;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    xnn_log_error(
      "failed to create Deconvolution operator with %.7g kernel scale: scale must be finite, normalized, and positive",
      kernel_scale);
    goto error;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create Deconvolution operator with %.7g output scale: scale must be finite, normalized, and positive",
      output_scale);
    goto error;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create Deconvolution operator with [%" PRIu8 ", %" PRIu8 "] output range: "
      "range min must be below range max",
      output_min, output_max);
    goto error;
  }

  status = xnn_status_unsupported_parameter;

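  // The Q8 micro-kernels requantize with a fixed-point multiplier that can only
  // represent combined scales strictly below 1.0, so larger scales are unsupported.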
  const float deconvolution_scale = input_scale * kernel_scale / output_scale;
  if (deconvolution_scale >= 1.0f) {
    xnn_log_error(
      "failed to create Deconvolution operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "Deconvolution operator scale %.7g is greater than or equal to 1.0",
      input_scale, kernel_scale, output_scale, deconvolution_scale);
    goto error;
  }

  status = xnn_status_out_of_memory;

  deconvolution_op = xnn_allocate_zero_memory(sizeof(struct xnn_operator));
  if (deconvolution_op == NULL) {
    xnn_log_error("failed to allocate %zu bytes for Deconvolution operator descriptor", sizeof(struct xnn_operator));
    goto error;
  }

  const uint32_t mr = xnn_params.q8.gemm.mr;
  const uint32_t nr = xnn_params.q8.gemm.nr;
  const uint32_t kr = UINT32_C(1) << xnn_params.q8.gemm.log2_kr;
  const xnn_igemm_ukernel_function ukernel_function = xnn_params.q8.gemm.igemm;

  const uint32_t n_stride = round_up(group_output_channels, nr);
  const uint32_t k_stride = round_up_po2(group_input_channels, kr);
  const uint32_t kernel_size = kernel_height * kernel_width;
  enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_igemm;
  size_t packed_group_weights_size = (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t)) * n_stride;
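  // A strided deconvolution without dilation, whose stride does not exceed the
  // kernel size, decomposes into stride_height * stride_width independent
  // subconvolutions, one per output phase, each using only the kernel taps that
  // land on that phase. This avoids zero-stuffing the input.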
  if (max(stride_height, stride_width) > 1 && max(dilation_height, dilation_width) == 1 && stride_width <= kernel_width && stride_height <= kernel_height) {
    ukernel_type = xnn_ukernel_type_subconv2d;
    const size_t subkernels = stride_height * stride_width;
    packed_group_weights_size = n_stride *
      (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t) * subkernels);

    const size_t subconvolution_buffer_size = sizeof(struct subconvolution_params) * subkernels;
    deconvolution_op->subconvolution_buffer = xnn_allocate_zero_memory(subconvolution_buffer_size);
    if (deconvolution_op->subconvolution_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for subconvolution buffer", subconvolution_buffer_size);
      goto error;
    }

    struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
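    // For each output phase (offset_x, offset_y), count the kernel taps that
    // contribute to it: taps offset_y, offset_y + stride_height, ... vertically,
    // and likewise horizontally, giving a subkernel_height x subkernel_width subkernel.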
    for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
      for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
        const size_t subkernel_height = divide_round_up(kernel_height - offset_y, stride_height);
        const size_t subkernel_width = divide_round_up(kernel_width - offset_x, stride_width);
        const size_t subkernel_size = subkernel_height * subkernel_width;

        subconvolution_params->indirection_x_stride = sizeof(void*) * subkernel_size;
        subconvolution_params->w_stride = sizeof(int32_t) + k_stride * subkernel_size * sizeof(uint8_t);
        subconvolution_params++;
      }
    }
  }
  deconvolution_op->packed_weights = xnn_allocate_memory(packed_group_weights_size * groups);
  if (deconvolution_op->packed_weights == NULL) {
    xnn_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
    goto error;
  }
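  // Pre-fill the packed weights with the kernel zero point so that the padded
  // (rounded-up) channel entries act as zero weights after zero-point subtraction.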
  memset(deconvolution_op->packed_weights, kernel_zero_point, packed_group_weights_size * groups);

  switch (ukernel_type) {
    case xnn_ukernel_type_igemm:
      xnn_pack_q8_conv_goki_w(
        groups, group_output_channels, kernel_size, group_input_channels,
        nr, kr,
        input_zero_point, kernel_zero_point,
        kernel, bias, deconvolution_op->packed_weights);
      break;
    case xnn_ukernel_type_subconv2d:
      xnn_pack_q8_deconv_goki_w(
        groups, group_output_channels, kernel_height, kernel_width, group_input_channels,
        stride_height, stride_width,
        nr, kr,
        input_zero_point, kernel_zero_point,
        kernel, bias, deconvolution_op->packed_weights, deconvolution_op->subconvolution_buffer);
      break;
    default:
      XNN_UNREACHABLE;
  }

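  // Out-of-bounds indirection entries point at this buffer; filling it with the
  // input zero point makes implicit padding contribute zero to the accumulation.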
  const size_t zero_size = sizeof(uint8_t) * k_stride + XNN_EXTRA_BYTES;
  void* zero_buffer = xnn_allocate_memory(zero_size);
  if (zero_buffer == NULL) {
    xnn_log_error("failed to allocate %zu bytes for zero padding", zero_size);
    goto error;
  }
  memset(zero_buffer, input_zero_point, zero_size);
  deconvolution_op->zero_buffer = zero_buffer;

  deconvolution_op->padding_top = output_padding_top;
  deconvolution_op->padding_right = output_padding_right;
  deconvolution_op->padding_bottom = output_padding_bottom;
  deconvolution_op->padding_left = output_padding_left;
  deconvolution_op->adjustment_height = adjustment_height;
  deconvolution_op->adjustment_width = adjustment_width;

  deconvolution_op->kernel_height = kernel_height;
  deconvolution_op->kernel_width = kernel_width;
  deconvolution_op->stride_height = stride_height;
  deconvolution_op->stride_width = stride_width;
  deconvolution_op->dilation_height = dilation_height;
  deconvolution_op->dilation_width = dilation_width;
  deconvolution_op->groups = groups;
  deconvolution_op->group_input_channels = group_input_channels;
  deconvolution_op->group_output_channels = group_output_channels;
  deconvolution_op->input_pixel_stride = input_pixel_stride;
  deconvolution_op->output_pixel_stride = output_pixel_stride;

  deconvolution_op->kernel_zero_point = kernel_zero_point;

  deconvolution_op->q8_gemm_params =
    xnn_compute_q8_gemm_params(
      input_zero_point, kernel_zero_point,
      deconvolution_scale, output_zero_point, output_min, output_max);

  deconvolution_op->type = xnn_operator_type_deconvolution_q8;
  deconvolution_op->ukernel.type = ukernel_type;
  deconvolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
    .default_function = ukernel_function,
    .mr = mr,
    .nr = nr,
    .kr = kr,
  };

  deconvolution_op->state = xnn_run_state_invalid;

  *deconvolution_op_out = deconvolution_op;
  return xnn_status_success;

error:
  xnn_delete_operator(deconvolution_op);
  return status;
}

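// A minimal usage sketch (not part of the library), assuming xnn_initialize() has
// already succeeded and that `kernel`, `bias`, `input`, `output`, and `threadpool`
// are provided by the caller; error checking is omitted:
//
//   xnn_operator_t op = NULL;
//   xnn_create_deconvolution2d_nhwc_f32(
//     /*output_padding=*/0, 0, 0, 0, /*adjustment=*/1, 1,
//     /*kernel=*/3, 3, /*stride=*/2, 2, /*dilation=*/1, 1,
//     /*groups=*/1, /*group_input_channels=*/8, /*group_output_channels=*/8,
//     /*input_pixel_stride=*/8, /*output_pixel_stride=*/8,
//     kernel, bias, /*output_min=*/-INFINITY, /*output_max=*/INFINITY,
//     /*flags=*/0, &op);
//   xnn_setup_deconvolution2d_nhwc_f32(op, /*batch_size=*/1,
//     /*input_height=*/16, /*input_width=*/16, input, output, threadpool);
//   xnn_run_operator(op, threadpool);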
enum xnn_status xnn_create_deconvolution2d_nhwc_f32(
    uint32_t output_padding_top,
    uint32_t output_padding_right,
    uint32_t output_padding_bottom,
    uint32_t output_padding_left,
    uint32_t adjustment_height,
    uint32_t adjustment_width,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    const float* kernel,
    const float* bias,
    float output_min,
    float output_max,
    uint32_t flags,
    xnn_operator_t* deconvolution_op_out)
{
  xnn_operator_t deconvolution_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to create Deconvolution operator: XNNPACK is not initialized");
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      kernel_width, kernel_height);
    goto error;
  }

  if (stride_width == 0 || stride_height == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero",
      stride_width, stride_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " dilation: "
      "dilation dimensions must be non-zero",
      dilation_width, dilation_height);
    goto error;
  }

  if (groups == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 " groups: number of groups must be non-zero", groups);
    goto error;
  }

  if (group_input_channels == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %zu input channels per group: "
      "number of channels must be non-zero",
      group_input_channels);
    goto error;
  }

  if (group_output_channels == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %zu output channels per group: "
      "number of channels must be non-zero",
      group_output_channels);
    goto error;
  }

  const size_t input_channels = groups * group_input_channels;
  if (input_pixel_stride < input_channels) {
    xnn_log_error(
      "failed to create Deconvolution operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
      input_pixel_stride, groups, group_input_channels);
    goto error;
  }

  const size_t output_channels = groups * group_output_channels;
  if (output_pixel_stride < output_channels) {
    xnn_log_error(
      "failed to create Deconvolution operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
      output_pixel_stride, groups, group_output_channels);
    goto error;
  }

  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create Deconvolution operator with NaN output lower bound: lower bound must be non-NaN");
    goto error;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create Deconvolution operator with NaN output upper bound: upper bound must be non-NaN");
    goto error;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create Deconvolution operator with [%.7g, %.7g] output range: "
      "lower bound must be below upper bound",
      output_min, output_max);
    goto error;
  }

  status = xnn_status_out_of_memory;

  deconvolution_op = xnn_allocate_zero_memory(sizeof(struct xnn_operator));
  if (deconvolution_op == NULL) {
    xnn_log_error("failed to allocate %zu bytes for Deconvolution operator descriptor", sizeof(struct xnn_operator));
    goto error;
  }

  uint32_t mr = xnn_params.f32.gemm.mr;
  uint32_t nr = xnn_params.f32.gemm.nr;
  uint32_t kr = UINT32_C(1) << xnn_params.f32.gemm.log2_kr;
  uint32_t sr = UINT32_C(1) << xnn_params.f32.gemm.log2_sr;
  xnn_igemm_ukernel_function ukernel_function = xnn_params.f32.gemm.igemm;
  if (nr > group_output_channels) {
    // The default micro-kernel's N tile is wider than the number of output channels:
    // prefer a micro-kernel with a narrower N tile if one is available.
    if (xnn_params.f32.gemm2.igemm != NULL) {
      mr = xnn_params.f32.gemm2.mr;
      nr = xnn_params.f32.gemm2.nr;
      kr = UINT32_C(1) << xnn_params.f32.gemm2.log2_kr;
      sr = UINT32_C(1) << xnn_params.f32.gemm2.log2_sr;
      ukernel_function = xnn_params.f32.gemm2.igemm;
    }
  }

  const uint32_t n_stride = round_up(group_output_channels, nr);
  const uint32_t k_stride = round_up_po2(group_input_channels, kr);
  const uint32_t kernel_size = kernel_height * kernel_width;
  enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_igemm;
  size_t packed_group_weights_size = (sizeof(float) * kernel_size * k_stride + sizeof(float)) * n_stride;
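  // Same decomposition as the Q8 path: a strided, undilated deconvolution whose
  // stride does not exceed the kernel splits into stride_height * stride_width
  // subconvolutions, one per output phase.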
  if (max(stride_height, stride_width) > 1 && max(dilation_height, dilation_width) == 1 && stride_width <= kernel_width && stride_height <= kernel_height) {
    ukernel_type = xnn_ukernel_type_subconv2d;
    const size_t subkernels = stride_height * stride_width;
    packed_group_weights_size = n_stride *
      (sizeof(float) * kernel_size * k_stride + sizeof(float) * subkernels);

    const size_t subconvolution_buffer_size = sizeof(struct subconvolution_params) * subkernels;
    deconvolution_op->subconvolution_buffer = xnn_allocate_zero_memory(subconvolution_buffer_size);
    if (deconvolution_op->subconvolution_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for subconvolution buffer", subconvolution_buffer_size);
      goto error;
    }

    struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
    for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
      for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
        const size_t subkernel_height = divide_round_up(kernel_height - offset_y, stride_height);
        const size_t subkernel_width = divide_round_up(kernel_width - offset_x, stride_width);
        const size_t subkernel_size = subkernel_height * subkernel_width;

        subconvolution_params->indirection_x_stride = sizeof(void*) * subkernel_size;
        subconvolution_params->w_stride = sizeof(float) + k_stride * subkernel_size * sizeof(float);
        subconvolution_params++;
      }
    }
  }
  deconvolution_op->packed_weights = xnn_allocate_memory(packed_group_weights_size * groups);
  if (deconvolution_op->packed_weights == NULL) {
    xnn_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
    goto error;
  }
  memset(deconvolution_op->packed_weights, 0, packed_group_weights_size * groups);

  switch (ukernel_type) {
    case xnn_ukernel_type_igemm:
      xnn_pack_f32_conv_goki_w(
        groups, group_output_channels, kernel_size, group_input_channels,
        nr, kr, sr,
        kernel, bias, deconvolution_op->packed_weights);
      break;
    case xnn_ukernel_type_subconv2d:
      xnn_pack_f32_deconv_goki_w(
        groups, group_output_channels, kernel_height, kernel_width, group_input_channels,
        stride_height, stride_width,
        nr, kr,
        kernel, bias, deconvolution_op->packed_weights, deconvolution_op->subconvolution_buffer);
      break;
    default:
      XNN_UNREACHABLE;
  }

  const size_t zero_size = k_stride * sizeof(float) + XNN_EXTRA_BYTES;
  void* zero_buffer = xnn_allocate_zero_memory(zero_size);
  if (zero_buffer == NULL) {
    xnn_log_error("failed to allocate %zu bytes for zero padding", zero_size);
    goto error;
  }
  deconvolution_op->zero_buffer = zero_buffer;

  deconvolution_op->padding_top = output_padding_top;
  deconvolution_op->padding_right = output_padding_right;
  deconvolution_op->padding_bottom = output_padding_bottom;
  deconvolution_op->padding_left = output_padding_left;
  deconvolution_op->adjustment_height = adjustment_height;
  deconvolution_op->adjustment_width = adjustment_width;

  deconvolution_op->kernel_height = kernel_height;
  deconvolution_op->kernel_width = kernel_width;
  deconvolution_op->stride_height = stride_height;
  deconvolution_op->stride_width = stride_width;
  deconvolution_op->dilation_height = dilation_height;
  deconvolution_op->dilation_width = dilation_width;
  deconvolution_op->groups = groups;
  deconvolution_op->group_input_channels = group_input_channels;
  deconvolution_op->group_output_channels = group_output_channels;
  deconvolution_op->input_pixel_stride = input_pixel_stride;
  deconvolution_op->output_pixel_stride = output_pixel_stride;

  deconvolution_op->f32_output_params = xnn_compute_f32_output_params(output_min, output_max);

  deconvolution_op->type = xnn_operator_type_deconvolution_f32;
  deconvolution_op->ukernel.type = ukernel_type;
  deconvolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
    .default_function = ukernel_function,
    .mr = mr,
    .nr = nr,
    .kr = kr,
  };

  deconvolution_op->state = xnn_run_state_invalid;

  *deconvolution_op_out = deconvolution_op;
  return xnn_status_success;

error:
  xnn_delete_operator(deconvolution_op);
  return status;
}

static enum xnn_status setup_conv_path(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    size_t output_height,
    size_t output_width,
    void* output,
    uint32_t log2_input_element_size,
    uint32_t log2_filter_element_size,
    uint32_t bias_element_size,
    uint32_t log2_output_element_size,
    const void* params,
    size_t num_threads)
{
  assert(deconvolution_op->ukernel.type == xnn_ukernel_type_igemm);

  const size_t kernel_height = deconvolution_op->kernel_height;
  const size_t kernel_width = deconvolution_op->kernel_width;
  const size_t kernel_size = kernel_height * kernel_width;

  const size_t groups = deconvolution_op->groups;
  const size_t output_size = output_height * output_width;
  const size_t mr = deconvolution_op->ukernel.igemm.mr;
  const size_t tiled_output_size = round_up(output_size, mr);
  const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;

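  // The indirection buffer holds, for every mr-tile of output pixels, pointers to
  // the input rows (or the zero buffer) read by each kernel tap. It depends only
  // on the input/output shape, so it is rebuilt just when the input dimensions change.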
  if (input_height != deconvolution_op->last_input_height ||
      input_width != deconvolution_op->last_input_width)
  {
    const void** indirection_buffer = (const void**) realloc(deconvolution_op->indirection_buffer, indirection_buffer_size);
    if (indirection_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
      return xnn_status_out_of_memory;
    }
    deconvolution_op->indirection_buffer = indirection_buffer;
    deconvolution_op->last_input = input;
    deconvolution_op->last_input_height = input_height;
    deconvolution_op->last_input_width = input_width;

    xnn_indirection_init_deconv2d(deconvolution_op, mr, log2_input_element_size);
  }

  const size_t group_input_channels = deconvolution_op->group_input_channels;
  const size_t group_output_channels = deconvolution_op->group_output_channels;
  const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
  const size_t w_stride = bias_element_size +
    (round_up_po2(group_input_channels, deconvolution_op->ukernel.igemm.kr) * kernel_size << log2_filter_element_size);
  deconvolution_op->context.igemm = (struct igemm_context) {
    .ks = kernel_size,
    .ks_scaled = kernel_size * mr * sizeof(void*),
    .kc = group_input_channels << log2_input_element_size,
    .w_stride = w_stride,
    .indirect_a = deconvolution_op->indirection_buffer,
    .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
    .zero = deconvolution_op->zero_buffer,
    .packed_w = deconvolution_op->packed_weights,
    .c = deconvolution_op->output,
    .cm_stride = deconvolution_op->output_pixel_stride << log2_output_element_size,
    .cn_stride = nr << log2_output_element_size,
    .ga_stride = group_input_channels << log2_input_element_size,
    .gw_stride = w_stride * round_up(group_output_channels, nr),
    .gc_stride = group_output_channels << log2_output_element_size,
    .ba_stride = input_height * input_width * deconvolution_op->input_pixel_stride << log2_input_element_size,
    .bc_stride = output_size * deconvolution_op->output_pixel_stride << log2_output_element_size,
    .log2_csize = log2_output_element_size,
    .ukernel = deconvolution_op->ukernel.igemm.default_function,
  };
  if (output_size == 1 && deconvolution_op->ukernel.igemm.mr1_function != NULL) {
    deconvolution_op->context.igemm.ukernel = deconvolution_op->ukernel.igemm.mr1_function;
  }
  memcpy(&deconvolution_op->context.igemm.params, params, sizeof(deconvolution_op->context.igemm.params));

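  // With multiple threads, shrink the output-channel tile so that there are roughly
  // target_tiles_per_thread tiles per thread, while keeping it a multiple of nr.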
  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
    if (max_nc < nc) {
      nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
    }
  }
  if (groups == 1) {
    deconvolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
    deconvolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_igemm;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = output_size;
    deconvolution_op->compute.range[2] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  } else {
    deconvolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
    deconvolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_gigemm;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = groups;
    deconvolution_op->compute.range[2] = output_size;
    deconvolution_op->compute.range[3] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  }
  deconvolution_op->state = xnn_run_state_ready;
  return xnn_status_success;
}

static enum xnn_status setup_subconv2d_path(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    size_t output_height,
    size_t output_width,
    void* output,
    uint32_t log2_input_element_size,
    uint32_t log2_filter_element_size,
    uint32_t bias_element_size,
    uint32_t log2_output_element_size,
    const void* params,
    size_t num_threads)
{
  assert(deconvolution_op->ukernel.type == xnn_ukernel_type_subconv2d);

  const size_t kernel_height = deconvolution_op->kernel_height;
  const size_t kernel_width = deconvolution_op->kernel_width;
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t stride_height = deconvolution_op->stride_height;
  const size_t stride_width = deconvolution_op->stride_width;

  const size_t groups = deconvolution_op->groups;
  const size_t output_size = output_height * output_width;
  const size_t mr = deconvolution_op->ukernel.igemm.mr;
  const size_t indirection_buffer_size =
    sizeof(void*) * kernel_size * output_height * stride_width * round_up(divide_round_up(output_width, stride_width), mr);

  if (input_height != deconvolution_op->last_input_height ||
      input_width != deconvolution_op->last_input_width)
  {
    const void** indirection_buffer = (const void**) realloc(deconvolution_op->indirection_buffer, indirection_buffer_size);
    if (indirection_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
      return xnn_status_out_of_memory;
    }
    deconvolution_op->indirection_buffer = indirection_buffer;
    deconvolution_op->last_input = input;
    deconvolution_op->last_input_height = input_height;
    deconvolution_op->last_input_width = input_width;

    xnn_indirection_init_subconv2d(deconvolution_op, mr, log2_input_element_size);

    // Initialize subconvolution parameters which depend on output dimensions or MR.
    struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
    const size_t modulo_padding_top = deconvolution_op->padding_top % stride_height;
    const size_t modulo_padding_left = deconvolution_op->padding_left % stride_width;
    const size_t output_pixel_stride = deconvolution_op->output_pixel_stride << log2_output_element_size;
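    // Each subconvolution (offset_x, offset_y) produces the output pixels whose
    // coordinates are congruent to the offset, shifted by the padding, modulo the
    // stride; subtract_modulo yields the first such output coordinate.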
    for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
      for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
        const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
        const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
        subconvolution_params->scaled_kernel_size = mr * subconvolution_params->indirection_x_stride;
        subconvolution_params->slice_width = divide_round_up(output_width - output_x_start, stride_width);
        subconvolution_params->slice_height = divide_round_up(output_height - output_y_start, stride_height);
        subconvolution_params->output =
          (void*) ((uintptr_t) output + ((output_y_start * output_width + output_x_start) * output_pixel_stride));
        ++subconvolution_params;
      }
    }
  }

  const size_t group_input_channels = deconvolution_op->group_input_channels;
  const size_t group_output_channels = deconvolution_op->group_output_channels;
  const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
  const size_t w_stride = stride_height * stride_width * bias_element_size +
    (round_up_po2(group_input_channels, deconvolution_op->ukernel.igemm.kr) * kernel_size << log2_filter_element_size);
  deconvolution_op->context.subconv = (struct subconv_context) {
    .subconvolution_params = deconvolution_op->subconvolution_buffer,
    .kc = group_input_channels << log2_input_element_size,
    .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
    .zero = deconvolution_op->zero_buffer,
    .cx_stride = stride_width * deconvolution_op->output_pixel_stride << log2_output_element_size,
    .cy_stride = stride_height * output_width * deconvolution_op->output_pixel_stride << log2_output_element_size,
    .cn_stride = nr << log2_output_element_size,
    .ga_stride = group_input_channels << log2_input_element_size,
    .gw_stride = w_stride * round_up(group_output_channels, nr),
    .gc_stride = group_output_channels << log2_output_element_size,
    .ba_stride = input_height * input_width * deconvolution_op->input_pixel_stride << log2_input_element_size,
    .bc_stride = output_size * deconvolution_op->output_pixel_stride << log2_output_element_size,
    .log2_csize = log2_output_element_size,
    .ukernel = deconvolution_op->ukernel.igemm.default_function,
  };
  memcpy(&deconvolution_op->context.subconv.params, params, sizeof(deconvolution_op->context.subconv.params));

  const size_t output_height_positions = divide_round_up(output_height, stride_height);
  const size_t output_width_positions = divide_round_up(output_width, stride_width);

  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * stride_height * stride_width *
      output_height_positions * divide_round_up(output_width_positions, mr);
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
    if (max_nc < nc) {
      nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
    }
  }

  if (groups == 1) {
    deconvolution_op->compute.type = xnn_parallelization_type_5d_tile_2d;
    deconvolution_op->compute.task_5d_tile_2d = (pthreadpool_task_5d_tile_2d_t) xnn_compute_subconv2d;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = stride_height * stride_width;
    deconvolution_op->compute.range[2] = output_height_positions;
    deconvolution_op->compute.range[3] = output_width_positions;
    deconvolution_op->compute.range[4] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  } else {
    deconvolution_op->compute.type = xnn_parallelization_type_6d_tile_2d;
    deconvolution_op->compute.task_6d_tile_2d = (pthreadpool_task_6d_tile_2d_t) xnn_compute_gsubconv2d;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = groups;
    deconvolution_op->compute.range[2] = stride_height * stride_width;
    deconvolution_op->compute.range[3] = output_height_positions;
    deconvolution_op->compute.range[4] = output_width_positions;
    deconvolution_op->compute.range[5] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  }

  deconvolution_op->state = xnn_run_state_ready;
  return xnn_status_success;
}

static enum xnn_status setup_deconvolution2d(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    void* output,
    uint32_t log2_input_element_size,
    uint32_t log2_filter_element_size,
    uint32_t bias_element_size,
    uint32_t log2_output_element_size,
    const void* params,
    size_t num_threads)
{
  deconvolution_op->state = xnn_run_state_invalid;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to setup Deconvolution operator: XNNPACK is not initialized");
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error(
      "failed to setup Deconvolution operator with %zux%zu input: input dimensions must be non-zero",
      input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  if (batch_size == 0) {
    deconvolution_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  deconvolution_op->batch_size = batch_size;
  deconvolution_op->input_height = input_height;
  deconvolution_op->input_width = input_width;
  deconvolution_op->input = input;
  deconvolution_op->output = output;

  const size_t output_height = deconvolution_op->output_height = compute_output_dimension(
    input_height, deconvolution_op->padding_top + deconvolution_op->padding_bottom,
    deconvolution_op->adjustment_height, deconvolution_op->kernel_height, deconvolution_op->dilation_height, deconvolution_op->stride_height);
  const size_t output_width = deconvolution_op->output_width = compute_output_dimension(
    input_width, deconvolution_op->padding_left + deconvolution_op->padding_right,
    deconvolution_op->adjustment_width, deconvolution_op->kernel_width, deconvolution_op->dilation_width, deconvolution_op->stride_width);

  switch (deconvolution_op->ukernel.type) {
    case xnn_ukernel_type_igemm:
      return setup_conv_path(
        deconvolution_op,
        batch_size,
        input_height, input_width, input,
        output_height, output_width, output,
        log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
        params, num_threads);
    case xnn_ukernel_type_subconv2d:
      return setup_subconv2d_path(
        deconvolution_op,
        batch_size,
        input_height, input_width, input,
        output_height, output_width, output,
        log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
        params, num_threads);
    default:
      XNN_UNREACHABLE;
  }
}

enum xnn_status xnn_setup_deconvolution2d_nhwc_q8(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const uint8_t* input,
    uint8_t* output,
    pthreadpool_t threadpool)
{
  if (deconvolution_op->type != xnn_operator_type_deconvolution_q8) {
    xnn_log_error("failed to setup Deconvolution (Q8) operator: operator type mismatch");
    return xnn_status_invalid_parameter;
  }

  return setup_deconvolution2d(
    deconvolution_op,
    batch_size, input_height, input_width,
    input, output,
    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
    &deconvolution_op->q8_gemm_params,
    pthreadpool_get_threads_count(threadpool));
}

enum xnn_status xnn_setup_deconvolution2d_nhwc_f32(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const float* input,
    float* output,
    pthreadpool_t threadpool)
{
  if (deconvolution_op->type != xnn_operator_type_deconvolution_f32) {
    xnn_log_error("failed to setup Deconvolution (F32) operator: operator type mismatch");
    return xnn_status_invalid_parameter;
  }

  return setup_deconvolution2d(
    deconvolution_op,
    batch_size, input_height, input_width,
    input, output,
    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
    2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
    sizeof(float) /* sizeof(bias element) */,
    2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
    &deconvolution_op->f32_output_params,
    pthreadpool_get_threads_count(threadpool));
}