blob: 3b3b0917cf581cc8bba8210637b9477a2fae5c4e [file] [log] [blame]
XNNPACK Teamb455b122019-09-27 18:10:33 -07001// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/indirection.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>

26
// Computes one spatial output dimension of a deconvolution, clamping the
// result at zero when the output padding exceeds the unpadded extent.
static inline size_t compute_output_dimension(
    size_t input_dimension,
    size_t output_padding_dimension,
    size_t adjustment_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t stride_dimension)
{
  // Extent of the dilated kernel: dilation inserts (dilation - 1) gaps
  // between consecutive kernel taps.
  const size_t dilated_kernel_dimension =
    (kernel_dimension - 1) * dilation_dimension + 1;
  // Unpadded output extent of the transposed convolution.
  const size_t unpadded_dimension =
    stride_dimension * (input_dimension - 1) + adjustment_dimension + dilated_kernel_dimension;
  // doz == "difference or zero": saturating subtraction of the output padding.
  return doz(unpadded_dimension, output_padding_dimension);
}
40
41enum xnn_status xnn_create_deconvolution2d_nhwc_q8(
42 uint32_t output_padding_top,
43 uint32_t output_padding_right,
44 uint32_t output_padding_bottom,
45 uint32_t output_padding_left,
XNNPACK Teamb455b122019-09-27 18:10:33 -070046 uint32_t kernel_height,
47 uint32_t kernel_width,
48 uint32_t stride_height,
49 uint32_t stride_width,
50 uint32_t dilation_height,
51 uint32_t dilation_width,
52 uint32_t groups,
53 size_t group_input_channels,
54 size_t group_output_channels,
55 size_t input_pixel_stride,
56 size_t output_pixel_stride,
57 uint8_t input_zero_point,
58 float input_scale,
59 uint8_t kernel_zero_point,
60 float kernel_scale,
61 const uint8_t* kernel,
62 const int32_t* bias,
63 uint8_t output_zero_point,
64 float output_scale,
65 uint8_t output_min,
66 uint8_t output_max,
67 uint32_t flags,
68 xnn_operator_t* deconvolution_op_out)
69{
70 xnn_operator_t deconvolution_op = NULL;
71 enum xnn_status status = xnn_status_uninitialized;
72
73 if (!xnn_params.initialized) {
74 xnn_log_error("failed to create Deconvolution operator: XNNPACK is not initialized");
75 goto error;
76 }
77
78 status = xnn_status_invalid_parameter;
79
80 if (kernel_width == 0 || kernel_height == 0) {
81 xnn_log_error(
82 "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
83 kernel_width, kernel_height);
84 goto error;
85 }
86
87 if (stride_width == 0 || stride_height == 0) {
88 xnn_log_error(
89 "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero",
90 stride_width, stride_height);
91 goto error;
92 }
93
94 if (dilation_width == 0 || dilation_height == 0) {
95 xnn_log_error(
96 "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " dilation: "
97 "dilation dimensions must be non-zero",
98 dilation_width, dilation_height);
99 goto error;
100 }
101
102 if (groups == 0) {
103 xnn_log_error(
104 "failed to create Deconvolution operator with %" PRIu32 " groups: number of groups must be non-zero", groups);
105 goto error;
106 }
107
108 if (group_input_channels == 0) {
109 xnn_log_error(
110 "failed to create Deconvolution operator with %zu input channels per group: "
111 "number of channels must be non-zero",
112 group_input_channels);
113 goto error;
114 }
115
116 if (group_output_channels == 0) {
117 xnn_log_error(
118 "failed to create Deconvolution operator with %zu output channels per group: "
119 "number of channels must be non-zero",
120 group_output_channels);
121 goto error;
122 }
123
124 const size_t input_channels = groups * group_input_channels;
125 if (input_pixel_stride < input_channels) {
126 xnn_log_error(
127 "failed to create Deconvolution operator with input pixel stride of %zu: "
128 "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
129 input_pixel_stride, groups, group_input_channels);
130 goto error;
131 }
132
133 const size_t output_channels = groups * group_output_channels;
134 if (output_pixel_stride < output_channels) {
135 xnn_log_error(
136 "failed to create Deconvolution operator with output pixel stride of %zu: "
137 "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
138 output_pixel_stride, groups, group_output_channels);
139 goto error;
140 }
141
142 if (input_scale <= 0.0f || !isnormal(input_scale)) {
143 xnn_log_error(
144 "failed to create Deconvolution operator with %.7g input scale: scale must be finite, normalized, and positive",
145 input_scale);
146 goto error;
147 }
148
149 if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
150 xnn_log_error(
151 "failed to create Deconvolution operator with %.7g kernel scale: scale must be finite, normalized, and positive",
152 kernel_scale);
153 goto error;
154 }
155
156 if (output_scale <= 0.0f || !isnormal(output_scale)) {
157 xnn_log_error(
158 "failed to create Deconvolution operator with %.7g output scale: scale must be finite, normalized, and positive",
159 output_scale);
160 goto error;
161 }
162
163 if (output_min >= output_max) {
164 xnn_log_error(
165 "failed to create Deconvolution operator with [%" PRIu8 ", %" PRIu8 "] output range: "
166 "range min must be below range max",
167 output_min, output_max);
168 goto error;
169 }
170
171 status = xnn_status_unsupported_parameter;
172
173 const float deconvolution_scale = input_scale * kernel_scale / output_scale;
174 if (deconvolution_scale >= 1.0f) {
175 xnn_log_error(
176 "failed to create Deconvolution operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
177 "Deconvolution operator scale %.7g is greater or equal to 1.0",
178 input_scale, kernel_scale, output_scale, deconvolution_scale);
179 goto error;
180 }
181
182 status = xnn_status_out_of_memory;
183
Marat Dukhan04f03be2019-11-19 12:36:47 -0800184 deconvolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700185 if (deconvolution_op == NULL) {
186 xnn_log_error("failed to allocate %zu bytes for Deconvolution operator descriptor", sizeof(struct xnn_operator));
187 goto error;
188 }
189
190 const uint32_t mr = xnn_params.q8.gemm.mr;
191 const uint32_t nr = xnn_params.q8.gemm.nr;
192 const uint32_t kr = UINT32_C(1) << xnn_params.q8.gemm.log2_kr;
193 const xnn_igemm_ukernel_function ukernel_function = xnn_params.q8.gemm.igemm;
194
195 const uint32_t n_stride = round_up(group_output_channels, nr);
196 const uint32_t k_stride = round_up_po2(group_input_channels, kr);
197 const uint32_t kernel_size = kernel_height * kernel_width;
198 enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_igemm;
199 size_t packed_group_weights_size = (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t)) * n_stride;
200 if (max(stride_height, stride_width) > 1 && max(dilation_height, dilation_width) == 1 && stride_width <= kernel_width && stride_height <= kernel_height) {
201 ukernel_type = xnn_ukernel_type_subconv2d;
202 const size_t subkernels = stride_height * stride_width;
203 packed_group_weights_size = n_stride *
204 (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t) * subkernels);
205
206 const size_t subconvolution_buffer_size = sizeof(struct subconvolution_params) * subkernels;
Marat Dukhan04f03be2019-11-19 12:36:47 -0800207 deconvolution_op->subconvolution_buffer = xnn_allocate_zero_simd_memory(subconvolution_buffer_size);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700208 if (deconvolution_op->subconvolution_buffer == NULL) {
209 xnn_log_error("failed to allocate %zu bytes for subconvolution buffer", subconvolution_buffer_size);
210 goto error;
211 }
212
213 struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
214 for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
215 for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
216 const size_t subkernel_height = divide_round_up(kernel_height - offset_y, stride_height);
217 const size_t subkernel_width = divide_round_up(kernel_width - offset_x, stride_width);
218 const size_t subkernel_size = subkernel_height * subkernel_width;
219
220 subconvolution_params->indirection_x_stride = sizeof(void*) * subkernel_size;
221 subconvolution_params->w_stride = sizeof(int32_t) + k_stride * subkernel_size * sizeof(uint8_t);
222 subconvolution_params++;
223 }
224 }
225 }
Marat Dukhan04f03be2019-11-19 12:36:47 -0800226 deconvolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700227 if (deconvolution_op->packed_weights == NULL) {
228 xnn_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
229 goto error;
230 }
231 memset(deconvolution_op->packed_weights, kernel_zero_point, packed_group_weights_size * groups);
232
233 switch (ukernel_type) {
234 case xnn_ukernel_type_igemm:
235 xnn_pack_q8_conv_goki_w(
236 groups, group_output_channels, kernel_size, group_input_channels,
237 nr, kr,
238 input_zero_point, kernel_zero_point,
239 kernel, bias, deconvolution_op->packed_weights);
240 break;
241 case xnn_ukernel_type_subconv2d:
242 xnn_pack_q8_deconv_goki_w(
243 groups, group_output_channels, kernel_height, kernel_width, group_input_channels,
244 stride_height, stride_width,
245 nr, kr,
246 input_zero_point, kernel_zero_point,
247 kernel, bias, deconvolution_op->packed_weights, deconvolution_op->subconvolution_buffer);
248 break;
249 default:
250 XNN_UNREACHABLE;
251 }
252
253 size_t zero_size = sizeof(uint8_t) * k_stride + XNN_EXTRA_BYTES;
Marat Dukhan04f03be2019-11-19 12:36:47 -0800254 void* zero_buffer = xnn_allocate_simd_memory(zero_size);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700255 if (zero_buffer == NULL) {
256 xnn_log_error("failed to allocate %zu bytes for zero padding", zero_size);
257 goto error;
258 }
259 memset(zero_buffer, input_zero_point, zero_size);
260 deconvolution_op->zero_buffer = zero_buffer;
261
262 deconvolution_op->padding_top = output_padding_top;
263 deconvolution_op->padding_right = output_padding_right;
264 deconvolution_op->padding_bottom = output_padding_bottom;
265 deconvolution_op->padding_left = output_padding_left;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700266
267 deconvolution_op->kernel_height = kernel_height;
268 deconvolution_op->kernel_width = kernel_width;
269 deconvolution_op->stride_height = stride_height;
270 deconvolution_op->stride_width = stride_width;
271 deconvolution_op->dilation_height = dilation_height;
272 deconvolution_op->dilation_width = dilation_width;
273 deconvolution_op->groups = groups;
274 deconvolution_op->group_input_channels = group_input_channels;
275 deconvolution_op->group_output_channels = group_output_channels;
276 deconvolution_op->input_pixel_stride = input_pixel_stride;
277 deconvolution_op->output_pixel_stride = output_pixel_stride;
278
279 deconvolution_op->kernel_zero_point = kernel_zero_point;
280
281 deconvolution_op->q8_gemm_params =
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700282 xnn_init_q8_gemm_params(
XNNPACK Teamb455b122019-09-27 18:10:33 -0700283 input_zero_point, kernel_zero_point,
284 deconvolution_scale, output_zero_point, output_min, output_max);
285
Marat Dukhanefc47b82019-11-18 09:25:38 -0800286 deconvolution_op->type = xnn_operator_type_deconvolution_nhwc_q8;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700287 deconvolution_op->ukernel.type = ukernel_type;
288 deconvolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
289 .default_function = ukernel_function,
290 .mr = mr,
291 .nr = nr,
292 .kr = kr,
293 };
294
295 deconvolution_op->state = xnn_run_state_invalid;
296
297 *deconvolution_op_out = deconvolution_op;
298 return xnn_status_success;
299
300error:
301 xnn_delete_operator(deconvolution_op);
302 return status;
303}
304
305enum xnn_status xnn_create_deconvolution2d_nhwc_f32(
306 uint32_t output_padding_top,
307 uint32_t output_padding_right,
308 uint32_t output_padding_bottom,
309 uint32_t output_padding_left,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700310 uint32_t kernel_height,
311 uint32_t kernel_width,
312 uint32_t stride_height,
313 uint32_t stride_width,
314 uint32_t dilation_height,
315 uint32_t dilation_width,
316 uint32_t groups,
317 size_t group_input_channels,
318 size_t group_output_channels,
319 size_t input_pixel_stride,
320 size_t output_pixel_stride,
321 const float* kernel,
322 const float* bias,
323 float output_min,
324 float output_max,
325 uint32_t flags,
326 xnn_operator_t* deconvolution_op_out)
327{
328 xnn_operator_t deconvolution_op = NULL;
329 enum xnn_status status = xnn_status_uninitialized;
330
331 if (!xnn_params.initialized) {
332 xnn_log_error("failed to create Deconvolution operator: XNNPACK is not initialized");
333 goto error;
334 }
335
336 status = xnn_status_invalid_parameter;
337
338 if (kernel_width == 0 || kernel_height == 0) {
339 xnn_log_error(
340 "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
341 kernel_width, kernel_height);
342 goto error;
343 }
344
345 if (stride_width == 0 || stride_height == 0) {
346 xnn_log_error(
347 "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero",
348 stride_width, stride_height);
349 goto error;
350 }
351
352 if (dilation_width == 0 || dilation_height == 0) {
353 xnn_log_error(
354 "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " dilation: "
355 "dilation dimensions must be non-zero",
356 dilation_width, dilation_height);
357 goto error;
358 }
359
360 if (groups == 0) {
361 xnn_log_error(
362 "failed to create Deconvolution operator with %" PRIu32 " groups: number of groups must be non-zero", groups);
363 goto error;
364 }
365
366 if (group_input_channels == 0) {
367 xnn_log_error(
368 "failed to create Deconvolution operator with %zu input channels per group: "
369 "number of channels must be non-zero",
370 group_input_channels);
371 goto error;
372 }
373
374 if (group_output_channels == 0) {
375 xnn_log_error(
376 "failed to create Deconvolution operator with %zu output channels per group: "
377 "number of channels must be non-zero",
378 group_output_channels);
379 goto error;
380 }
381
382 const size_t input_channels = groups * group_input_channels;
383 if (input_pixel_stride < input_channels) {
384 xnn_log_error(
385 "failed to create Deconvolution operator with input pixel stride of %zu: "
386 "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
387 input_pixel_stride, groups, group_input_channels);
388 goto error;
389 }
390
391 const size_t output_channels = groups * group_output_channels;
392 if (output_pixel_stride < output_channels) {
393 xnn_log_error(
394 "failed to create Deconvolution operator with output pixel stride of %zu: "
395 "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
396 output_pixel_stride, groups, group_output_channels);
397 goto error;
398 }
399
400 if (isnan(output_min)) {
401 xnn_log_error(
402 "failed to create Deconvolution operator with NaN output lower bound: lower bound must be non-NaN");
403 goto error;
404 }
405
406 if (isnan(output_max)) {
407 xnn_log_error(
408 "failed to create Deconvolution operator with NaN output upper bound: upper bound must be non-NaN");
409 goto error;
410 }
411
412 if (output_min >= output_max) {
413 xnn_log_error(
414 "failed to create Deconvolution operator with [%.7g, %.7g] output range: "
415 "lower bound must be below upper bound",
416 output_min, output_max);
417 goto error;
418 }
419
420 status = xnn_status_out_of_memory;
421
Marat Dukhan04f03be2019-11-19 12:36:47 -0800422 deconvolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700423 if (deconvolution_op == NULL) {
424 xnn_log_error("failed to allocate %zu bytes for Deconvolution operator descriptor", sizeof(struct xnn_operator));
425 goto error;
426 }
427
428 uint32_t mr = xnn_params.f32.gemm.mr;
429 uint32_t nr = xnn_params.f32.gemm.nr;
430 uint32_t kr = UINT32_C(1) << xnn_params.f32.gemm.log2_kr;
431 uint32_t sr = UINT32_C(1) << xnn_params.f32.gemm.log2_sr;
432 xnn_igemm_ukernel_function ukernel_function = xnn_params.f32.gemm.igemm;
433 if (nr > group_output_channels) {
434 // Default micro-kernel is suboptimal. Try to find a better micro-kernel.
435 if (xnn_params.f32.gemm2.igemm != NULL) {
436 mr = xnn_params.f32.gemm2.mr;
437 nr = xnn_params.f32.gemm2.nr;
438 kr = UINT32_C(1) << xnn_params.f32.gemm2.log2_kr;
439 sr = UINT32_C(1) << xnn_params.f32.gemm2.log2_sr;
440 ukernel_function = xnn_params.f32.gemm2.igemm;
441 }
442 }
443
444 const uint32_t n_stride = round_up(group_output_channels, nr);
445 const uint32_t k_stride = round_up_po2(group_input_channels, kr);
446 const uint32_t kernel_size = kernel_height * kernel_width;
447 enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_igemm;
448 size_t packed_group_weights_size = (sizeof(float) * kernel_size * k_stride + sizeof(float)) * n_stride;
449 if (max(stride_height, stride_width) > 1 && max(dilation_height, dilation_width) == 1 && stride_width <= kernel_width && stride_height <= kernel_height) {
450 ukernel_type = xnn_ukernel_type_subconv2d;
451 const size_t subkernels = stride_height * stride_width;
452 packed_group_weights_size = n_stride *
453 (sizeof(float) * kernel_size * k_stride + sizeof(float) * subkernels);
454
455 const size_t subconvolution_buffer_size = sizeof(struct subconvolution_params) * subkernels;
Marat Dukhan04f03be2019-11-19 12:36:47 -0800456 deconvolution_op->subconvolution_buffer = xnn_allocate_zero_simd_memory(subconvolution_buffer_size);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700457 if (deconvolution_op->subconvolution_buffer == NULL) {
458 xnn_log_error("failed to allocate %zu bytes for subconvolution buffer", subconvolution_buffer_size);
459 goto error;
460 }
461
462 struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
463 for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
464 for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
465 const size_t subkernel_height = divide_round_up(kernel_height - offset_y, stride_height);
466 const size_t subkernel_width = divide_round_up(kernel_width - offset_x, stride_width);
467 const size_t subkernel_size = subkernel_height * subkernel_width;
468
469 subconvolution_params->indirection_x_stride = sizeof(void*) * subkernel_size;
470 subconvolution_params->w_stride = sizeof(float) + k_stride * subkernel_size * sizeof(float);
471 subconvolution_params++;
472 }
473 }
474 }
Marat Dukhan04f03be2019-11-19 12:36:47 -0800475 deconvolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700476 if (deconvolution_op->packed_weights == NULL) {
477 xnn_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
478 goto error;
479 }
480 memset(deconvolution_op->packed_weights, 0, packed_group_weights_size * groups);
481
482 switch (ukernel_type) {
483 case xnn_ukernel_type_igemm:
484 xnn_pack_f32_conv_goki_w(
485 groups, group_output_channels, kernel_size, group_input_channels,
486 nr, kr, sr,
487 kernel, bias, deconvolution_op->packed_weights);
488 break;
489 case xnn_ukernel_type_subconv2d:
490 xnn_pack_f32_deconv_goki_w(
491 groups, group_output_channels, kernel_height, kernel_width, group_input_channels,
492 stride_height, stride_width,
Marat Dukhanc4ae7de2019-10-25 02:06:26 -0700493 nr, kr, sr,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700494 kernel, bias, deconvolution_op->packed_weights, deconvolution_op->subconvolution_buffer);
495 break;
496 default:
497 XNN_UNREACHABLE;
498 }
499
500 const size_t zero_size = k_stride * sizeof(float) + XNN_EXTRA_BYTES;
Marat Dukhan04f03be2019-11-19 12:36:47 -0800501 void* zero_buffer = xnn_allocate_zero_simd_memory(zero_size);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700502 if (zero_buffer == NULL) {
503 xnn_log_error("failed to allocate %zu bytes for zero padding", zero_size);
504 goto error;
505 }
506 deconvolution_op->zero_buffer = zero_buffer;
507
508 deconvolution_op->padding_top = output_padding_top;
509 deconvolution_op->padding_right = output_padding_right;
510 deconvolution_op->padding_bottom = output_padding_bottom;
511 deconvolution_op->padding_left = output_padding_left;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700512
513 deconvolution_op->kernel_height = kernel_height;
514 deconvolution_op->kernel_width = kernel_width;
515 deconvolution_op->stride_height = stride_height;
516 deconvolution_op->stride_width = stride_width;
517 deconvolution_op->dilation_height = dilation_height;
518 deconvolution_op->dilation_width = dilation_width;
519 deconvolution_op->groups = groups;
520 deconvolution_op->group_input_channels = group_input_channels;
521 deconvolution_op->group_output_channels = group_output_channels;
522 deconvolution_op->input_pixel_stride = input_pixel_stride;
523 deconvolution_op->output_pixel_stride = output_pixel_stride;
524
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700525 deconvolution_op->f32_output_params = xnn_init_f32_output_params(output_min, output_max);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700526
Marat Dukhanefc47b82019-11-18 09:25:38 -0800527 deconvolution_op->type = xnn_operator_type_deconvolution_nhwc_f32;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700528 deconvolution_op->ukernel.type = ukernel_type;
529 deconvolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
530 .default_function = ukernel_function,
531 .mr = mr,
532 .nr = nr,
533 .kr = kr,
534 };
535
536 deconvolution_op->state = xnn_run_state_invalid;
537
538 *deconvolution_op_out = deconvolution_op;
539 return xnn_status_success;
540
541error:
542 xnn_delete_operator(deconvolution_op);
543 return status;
544}
545
// Configures a deconvolution operator to run as a single indirect GEMM
// (IGEMM) over the whole kernel — the generic path used when the
// subconvolution decomposition does not apply.
//
// Rebuilds the indirection buffer only when the input spatial dimensions
// change, fills in the igemm_context with all strides the micro-kernel
// needs, and chooses a parallelization shape. Element sizes arrive as
// log2 values so byte strides can be computed with shifts.
//
// Returns xnn_status_success, or xnn_status_out_of_memory if the
// indirection buffer cannot be (re)allocated.
static enum xnn_status setup_conv_path(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    size_t output_height,
    size_t output_width,
    void* output,
    uint32_t log2_input_element_size,
    uint32_t log2_filter_element_size,
    uint32_t bias_element_size,
    uint32_t log2_output_element_size,
    const void* params,
    size_t num_threads)
{
  assert(deconvolution_op->ukernel.type == xnn_ukernel_type_igemm);

  const size_t kernel_height = deconvolution_op->kernel_height;
  const size_t kernel_width = deconvolution_op->kernel_width;
  const size_t kernel_size = kernel_height * kernel_width;

  const size_t groups = deconvolution_op->groups;
  const size_t output_size = output_height * output_width;
  const size_t mr = deconvolution_op->ukernel.igemm.mr;
  // Output positions rounded up to the MR tile so every tile has a full
  // set of indirection pointers.
  const size_t tiled_output_size = round_up(output_size, mr);
  const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;

  // The indirection buffer depends only on input geometry; rebuild it only
  // when the input height/width changed since the last setup.
  if (input_height != deconvolution_op->last_input_height ||
      input_width != deconvolution_op->last_input_width)
  {
    const void** indirection_buffer = (const void**) xnn_reallocate_memory(deconvolution_op->indirection_buffer, indirection_buffer_size);
    if (indirection_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
      return xnn_status_out_of_memory;
    }
    deconvolution_op->indirection_buffer = indirection_buffer;
    // Remember the input pointer and geometry the buffer was built against.
    deconvolution_op->last_input = input;
    deconvolution_op->last_input_height = input_height;
    deconvolution_op->last_input_width = input_width;

    xnn_indirection_init_deconv2d(deconvolution_op, mr, log2_input_element_size);
  }

  const size_t group_input_channels = deconvolution_op->group_input_channels;
  const size_t group_output_channels = deconvolution_op->group_output_channels;
  const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
  // Stride between packed weight columns: one bias element plus the
  // KR-rounded input channels for every kernel tap.
  const size_t w_stride = bias_element_size +
    (round_up_po2(group_input_channels, deconvolution_op->ukernel.igemm.kr) * kernel_size << log2_filter_element_size);
  deconvolution_op->context.igemm = (struct igemm_context) {
      .ks = kernel_size,
      .ks_scaled = kernel_size * mr * sizeof(void*),
      .kc = group_input_channels << log2_input_element_size,
      .w_stride = w_stride,
      .indirect_a = deconvolution_op->indirection_buffer,
      // Byte delta between the current input pointer and the one the
      // indirection buffer was built with — avoids rebuilding the buffer
      // when only the input base address changes.
      .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
      .zero = deconvolution_op->zero_buffer,
      .packed_w = deconvolution_op->packed_weights,
      .c = deconvolution_op->output,
      .cm_stride = deconvolution_op->output_pixel_stride << log2_output_element_size,
      .cn_stride = nr << log2_output_element_size,
      .ga_stride = group_input_channels << log2_input_element_size,
      .gw_stride = w_stride * round_up(group_output_channels, nr),
      .gc_stride = group_output_channels << log2_output_element_size,
      .ba_stride = input_height * input_width * deconvolution_op->input_pixel_stride << log2_input_element_size,
      .bc_stride = output_size * deconvolution_op->output_pixel_stride << log2_output_element_size,
      .log2_csize = log2_output_element_size,
      .ukernel = deconvolution_op->ukernel.igemm.default_function,
  };
  // Prefer the specialized MR=1 micro-kernel for a single output pixel,
  // when one is available.
  if (output_size == 1 && deconvolution_op->ukernel.igemm.mr1_function != NULL) {
    deconvolution_op->context.igemm.ukernel = deconvolution_op->ukernel.igemm.mr1_function;
  }
  memcpy(&deconvolution_op->context.igemm.params, params, sizeof(deconvolution_op->context.igemm.params));

  // Shrink the N tile so multi-threaded runs get roughly
  // target_tiles_per_thread tiles per thread, keeping NR alignment.
  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
    if (max_nc < nc) {
      nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
    }
  }
  // Grouped deconvolution needs an extra parallelization dimension for the
  // group index.
  if (groups == 1) {
    deconvolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
    deconvolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_igemm;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = output_size;
    deconvolution_op->compute.range[2] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  } else {
    deconvolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
    deconvolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_gigemm;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = groups;
    deconvolution_op->compute.range[2] = output_size;
    deconvolution_op->compute.range[3] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  }
  deconvolution_op->state = xnn_run_state_ready;
  return xnn_status_success;
}
650
651static enum xnn_status setup_subconv2d_path(
652 xnn_operator_t deconvolution_op,
653 size_t batch_size,
654 size_t input_height,
655 size_t input_width,
656 const void* input,
657 size_t output_height,
658 size_t output_width,
659 void* output,
660 uint32_t log2_input_element_size,
661 uint32_t log2_filter_element_size,
662 uint32_t bias_element_size,
663 uint32_t log2_output_element_size,
664 const void* params,
665 size_t num_threads)
666{
667 assert(deconvolution_op->ukernel.type == xnn_ukernel_type_subconv2d);
668
669 const size_t kernel_height = deconvolution_op->kernel_height;
670 const size_t kernel_width = deconvolution_op->kernel_width;
671 const size_t kernel_size = kernel_height * kernel_width;
672 const size_t stride_height = deconvolution_op->stride_height;
673 const size_t stride_width = deconvolution_op->stride_width;
674
675 const size_t groups = deconvolution_op->groups;
676 const size_t output_size = output_height * output_width;
677 const size_t mr = deconvolution_op->ukernel.igemm.mr;
678 const size_t indirection_buffer_size =
679 sizeof(void*) * kernel_size * output_height * stride_width * round_up(divide_round_up(output_width, stride_width), mr);
680
681 if (input_height != deconvolution_op->last_input_height ||
682 input_width != deconvolution_op->last_input_width)
683 {
Marat Dukhan04f03be2019-11-19 12:36:47 -0800684 const void** indirection_buffer = (const void**) xnn_reallocate_memory(deconvolution_op->indirection_buffer, indirection_buffer_size);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700685 if (indirection_buffer == NULL) {
686 xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
687 return xnn_status_out_of_memory;
688 }
689 deconvolution_op->indirection_buffer = indirection_buffer;
690 deconvolution_op->last_input = input;
691 deconvolution_op->last_input_height = input_height;
692 deconvolution_op->last_input_width = input_width;
693
694 xnn_indirection_init_subconv2d(deconvolution_op, mr, log2_input_element_size);
695
696 // Initialize subconvolution parameters which depend on output dimensions or MR.
697 struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
698 const size_t modulo_padding_top = deconvolution_op->padding_top % stride_height;
699 const size_t modulo_padding_left = deconvolution_op->padding_left % stride_width;
700 const size_t output_pixel_stride = deconvolution_op->output_pixel_stride << log2_output_element_size;
701 for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
702 for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
703 const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
704 const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
705 subconvolution_params->scaled_kernel_size = mr * subconvolution_params->indirection_x_stride;
706 subconvolution_params->slice_width = divide_round_up(output_width - output_x_start, stride_width);
707 subconvolution_params->slice_height = divide_round_up(output_height - output_y_start, stride_height);
708 subconvolution_params->output =
709 (void*) ((uintptr_t) output + ((output_y_start * output_width + output_x_start) * output_pixel_stride));
710 ++subconvolution_params;
711 }
712 }
713 }
714
715 const size_t group_input_channels = deconvolution_op->group_input_channels;
716 const size_t group_output_channels = deconvolution_op->group_output_channels;
717 const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
718 const size_t w_stride = stride_height * stride_width * bias_element_size +
719 (round_up_po2(group_input_channels, deconvolution_op->ukernel.igemm.kr) * kernel_size << log2_filter_element_size);
720 deconvolution_op->context.subconv = (struct subconv_context) {
721 .subconvolution_params = deconvolution_op->subconvolution_buffer,
722 .kc = group_input_channels << log2_input_element_size,
723 .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
724 .zero = deconvolution_op->zero_buffer,
725 .cx_stride = stride_width * deconvolution_op->output_pixel_stride << log2_output_element_size,
726 .cy_stride = stride_height * output_width * deconvolution_op->output_pixel_stride << log2_output_element_size,
727 .cn_stride = nr << log2_output_element_size,
728 .ga_stride = group_input_channels << log2_input_element_size,
729 .gw_stride = w_stride * round_up(group_output_channels, nr),
730 .gc_stride = group_output_channels << log2_output_element_size,
731 .ba_stride = input_height * input_width * deconvolution_op->input_pixel_stride << log2_input_element_size,
732 .bc_stride = output_size * deconvolution_op->output_pixel_stride << log2_output_element_size,
733 .log2_csize = log2_output_element_size,
734 .ukernel = deconvolution_op->ukernel.igemm.default_function,
735 };
736 memcpy(&deconvolution_op->context.subconv.params, params, sizeof(deconvolution_op->context.subconv.params));
737
738 const size_t output_height_positions = divide_round_up(output_height, stride_height);
739 const size_t output_width_positions = divide_round_up(output_width, stride_width);
740
741 size_t nc = group_output_channels;
742 if (num_threads > 1) {
743 const size_t num_other_tiles = groups * stride_height * stride_width *
744 output_height_positions * divide_round_up(output_width_positions, mr);
745 const size_t target_tiles_per_thread = 5;
746 const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
747 if (max_nc < nc) {
748 nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
749 }
750 }
751
752 if (groups == 1) {
753 deconvolution_op->compute.type = xnn_parallelization_type_5d_tile_2d;
754 deconvolution_op->compute.task_5d_tile_2d = (pthreadpool_task_5d_tile_2d_t) xnn_compute_subconv2d;
755 deconvolution_op->compute.range[0] = batch_size;
756 deconvolution_op->compute.range[1] = stride_height * stride_width;
757 deconvolution_op->compute.range[2] = divide_round_up(output_height, stride_height);
758 deconvolution_op->compute.range[3] = divide_round_up(output_width, stride_width);
759 deconvolution_op->compute.range[4] = group_output_channels;
760 deconvolution_op->compute.tile[0] = mr;
761 deconvolution_op->compute.tile[1] = nc;
762 } else {
763 deconvolution_op->compute.type = xnn_parallelization_type_6d_tile_2d;
764 deconvolution_op->compute.task_6d_tile_2d = (pthreadpool_task_6d_tile_2d_t) xnn_compute_gsubconv2d;
765 deconvolution_op->compute.range[0] = batch_size;
766 deconvolution_op->compute.range[1] = groups;
767 deconvolution_op->compute.range[2] = stride_height * stride_width;
768 deconvolution_op->compute.range[3] = divide_round_up(output_height, stride_height);
769 deconvolution_op->compute.range[4] = divide_round_up(output_width, stride_width);
770 deconvolution_op->compute.range[5] = group_output_channels;
771 deconvolution_op->compute.tile[0] = mr;
772 deconvolution_op->compute.tile[1] = nc;
773 }
774
775 deconvolution_op->state = xnn_run_state_ready;
776 return xnn_status_success;
777}
778
779static enum xnn_status setup_deconvolution2d(
780 xnn_operator_t deconvolution_op,
781 size_t batch_size,
782 size_t input_height,
783 size_t input_width,
Marat Dukhan1898b912019-11-05 12:25:18 -0800784 uint32_t adjustment_height,
785 uint32_t adjustment_width,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700786 const void* input,
787 void* output,
788 uint32_t log2_input_element_size,
789 uint32_t log2_filter_element_size,
790 uint32_t bias_element_size,
791 uint32_t log2_output_element_size,
792 const void* params,
793 size_t num_threads)
794{
795 deconvolution_op->state = xnn_run_state_invalid;
796
797 if (!xnn_params.initialized) {
798 xnn_log_error("failed to setup Deconvolution operator: XNNPACK is not initialized");
799 return xnn_status_uninitialized;
800 }
801
802 if (input_width == 0 || input_height == 0) {
803 xnn_log_error(
804 "failed to setup Deconvolution with %zux%zu input: input dimensions must be non-zero",
805 input_width, input_height);
806 return xnn_status_invalid_parameter;
807 }
808
Marat Dukhan1898b912019-11-05 12:25:18 -0800809 if (adjustment_height >= deconvolution_op->stride_height) {
810 xnn_log_error(
811 "failed to setup Deconvolution with %" PRIu32 " height adjustment: "
812 "height adjustment must be smaller than height stride (%" PRIu32 ")",
813 adjustment_height, deconvolution_op->stride_height);
814 return xnn_status_invalid_parameter;
815 }
816
817 if (adjustment_width >= deconvolution_op->stride_width) {
818 xnn_log_error(
819 "failed to setup Deconvolution with %" PRIu32 " width adjustment: "
820 "width adjustment must be smaller than width stride (%" PRIu32 ")",
821 adjustment_width, deconvolution_op->stride_width);
822 return xnn_status_invalid_parameter;
823 }
824
XNNPACK Teamb455b122019-09-27 18:10:33 -0700825 if (batch_size == 0) {
826 deconvolution_op->state = xnn_run_state_skip;
827 return xnn_status_success;
828 }
829
830 deconvolution_op->batch_size = batch_size;
831 deconvolution_op->input_height = input_height;
832 deconvolution_op->input_width = input_width;
833 deconvolution_op->input = input;
834 deconvolution_op->output = output;
835
836 const size_t output_height = deconvolution_op->output_height = compute_output_dimension(
837 input_height, deconvolution_op->padding_top + deconvolution_op->padding_bottom,
Marat Dukhan1898b912019-11-05 12:25:18 -0800838 adjustment_height, deconvolution_op->kernel_height, deconvolution_op->dilation_height, deconvolution_op->stride_height);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700839 const size_t output_width = deconvolution_op->output_width = compute_output_dimension(
840 input_width, deconvolution_op->padding_left + deconvolution_op->padding_right,
Marat Dukhan1898b912019-11-05 12:25:18 -0800841 adjustment_width, deconvolution_op->kernel_width, deconvolution_op->dilation_width, deconvolution_op->stride_width);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700842
843 switch (deconvolution_op->ukernel.type) {
844 case xnn_ukernel_type_igemm:
845 return setup_conv_path(
846 deconvolution_op,
847 batch_size,
848 input_height, input_width, input,
849 output_height, output_width, output,
850 log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
851 params, num_threads);
852 case xnn_ukernel_type_subconv2d:
853 return setup_subconv2d_path(
854 deconvolution_op,
855 batch_size,
856 input_height, input_width, input,
857 output_height, output_width, output,
858 log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
859 params, num_threads);
860 default:
861 XNN_UNREACHABLE;
862 }
863}
864
865enum xnn_status xnn_setup_deconvolution2d_nhwc_q8(
866 xnn_operator_t deconvolution_op,
867 size_t batch_size,
868 size_t input_height,
869 size_t input_width,
Marat Dukhan1898b912019-11-05 12:25:18 -0800870 uint32_t adjustment_height,
871 uint32_t adjustment_width,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700872 const uint8_t* input,
873 uint8_t* output,
874 pthreadpool_t threadpool)
875{
Marat Dukhanefc47b82019-11-18 09:25:38 -0800876 if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_q8) {
877 xnn_log_error("failed to setup Deconvolution (NHWC, Q8) operator: operator type mismatch");
XNNPACK Teamb455b122019-09-27 18:10:33 -0700878 return xnn_status_invalid_parameter;
879 }
880
881 return setup_deconvolution2d(
882 deconvolution_op,
883 batch_size, input_height, input_width,
Marat Dukhan1898b912019-11-05 12:25:18 -0800884 adjustment_height, adjustment_width,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700885 input, output,
886 0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
887 0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
888 sizeof(int32_t) /* sizeof(bias element) */,
889 0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
890 &deconvolution_op->q8_gemm_params,
891 pthreadpool_get_threads_count(threadpool));
892}
893
894enum xnn_status xnn_setup_deconvolution2d_nhwc_f32(
895 xnn_operator_t deconvolution_op,
896 size_t batch_size,
897 size_t input_height,
898 size_t input_width,
Marat Dukhan1898b912019-11-05 12:25:18 -0800899 uint32_t adjustment_height,
900 uint32_t adjustment_width,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700901 const float* input,
902 float* output,
903 pthreadpool_t threadpool)
904{
Marat Dukhanefc47b82019-11-18 09:25:38 -0800905 if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_f32) {
906 xnn_log_error("failed to setup Deconvolution (NHWC, F32) operator: operator type mismatch");
XNNPACK Teamb455b122019-09-27 18:10:33 -0700907 return xnn_status_invalid_parameter;
908 }
909
910 return setup_deconvolution2d(
911 deconvolution_op,
912 batch_size, input_height, input_width,
Marat Dukhan1898b912019-11-05 12:25:18 -0800913 adjustment_height, adjustment_width,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700914 input, output,
915 2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
916 2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
917 sizeof(float) /* sizeof(bias element) */,
918 2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
919 &deconvolution_op->f32_output_params,
920 pthreadpool_get_threads_count(threadpool));
921}