// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/indirection.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>

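// Computes one spatial dimension of the deconvolution output:
//   output = stride * (input - 1) + adjustment + (kernel - 1) * dilation + 1 - output_padding
// clamped at zero via doz() (difference-or-zero). For example, input = 4,
// stride = 2, kernel = 3, dilation = 1, adjustment = 0, and zero output
// padding give 2 * 3 + 0 + 3 = 9 output elements.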
static inline size_t compute_output_dimension(
    size_t input_dimension,
    size_t output_padding_dimension,
    size_t adjustment_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t stride_dimension)
{
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  return doz(
    stride_dimension * (input_dimension - 1) + adjustment_dimension + effective_kernel_dimension,
    output_padding_dimension);
}

enum xnn_status xnn_create_deconvolution2d_nhwc_q8(
    uint32_t output_padding_top,
    uint32_t output_padding_right,
    uint32_t output_padding_bottom,
    uint32_t output_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    uint8_t input_zero_point,
    float input_scale,
    uint8_t kernel_zero_point,
    float kernel_scale,
    const uint8_t* kernel,
    const int32_t* bias,
    uint8_t output_zero_point,
    float output_scale,
    uint8_t output_min,
    uint8_t output_max,
    uint32_t flags,
    xnn_operator_t* deconvolution_op_out)
{
  xnn_operator_t deconvolution_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to create Deconvolution operator: XNNPACK is not initialized");
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      kernel_width, kernel_height);
    goto error;
  }

  if (stride_width == 0 || stride_height == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero",
      stride_width, stride_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " dilation: "
      "dilation dimensions must be non-zero",
      dilation_width, dilation_height);
    goto error;
  }

  if (groups == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 " groups: number of groups must be non-zero", groups);
    goto error;
  }

  if (group_input_channels == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %zu input channels per group: "
      "number of channels must be non-zero",
      group_input_channels);
    goto error;
  }

  if (group_output_channels == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %zu output channels per group: "
      "number of channels must be non-zero",
      group_output_channels);
    goto error;
  }

  const size_t input_channels = groups * group_input_channels;
  if (input_pixel_stride < input_channels) {
    xnn_log_error(
      "failed to create Deconvolution operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
      input_pixel_stride, groups, group_input_channels);
    goto error;
  }

  const size_t output_channels = groups * group_output_channels;
  if (output_pixel_stride < output_channels) {
    xnn_log_error(
      "failed to create Deconvolution operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
      output_pixel_stride, groups, group_output_channels);
    goto error;
  }

  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create Deconvolution operator with %.7g input scale: scale must be finite, normalized, and positive",
      input_scale);
    goto error;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    xnn_log_error(
      "failed to create Deconvolution operator with %.7g kernel scale: scale must be finite, normalized, and positive",
      kernel_scale);
    goto error;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create Deconvolution operator with %.7g output scale: scale must be finite, normalized, and positive",
      output_scale);
    goto error;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create Deconvolution operator with [%" PRIu8 ", %" PRIu8 "] output range: "
      "range min must be below range max",
      output_min, output_max);
    goto error;
  }

  const bool any_padding = (output_padding_left | output_padding_top | output_padding_right | output_padding_bottom) != 0;
  if (any_padding && (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32 " padding: "
      "TensorFlow SAME padding can't be combined with explicit padding specification",
      output_padding_top, output_padding_left, output_padding_bottom, output_padding_right);
    goto error;
  }

  status = xnn_status_unsupported_parameter;

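  // Q8 requantization encodes the combined scale (input_scale * kernel_scale
  // / output_scale) as a fixed-point multiplier with a shift, which requires
  // the scale to be strictly below 1.0.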
  const float deconvolution_scale = input_scale * kernel_scale / output_scale;
  if (deconvolution_scale >= 1.0f) {
    xnn_log_error(
      "failed to create Deconvolution operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "Deconvolution operator scale %.7g is greater than or equal to 1.0",
      input_scale, kernel_scale, output_scale, deconvolution_scale);
    goto error;
  }

  status = xnn_status_out_of_memory;

  deconvolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (deconvolution_op == NULL) {
    xnn_log_error("failed to allocate %zu bytes for Deconvolution operator descriptor", sizeof(struct xnn_operator));
    goto error;
  }

  const uint32_t mr = xnn_params.q8.gemm.mr;
  const uint32_t nr = xnn_params.q8.gemm.nr;
  const uint32_t kr = UINT32_C(1) << xnn_params.q8.gemm.log2_kr;
  const struct xnn_hmp_igemm_ukernel igemm_ukernel = xnn_params.q8.gemm.minmax.igemm;
  const struct xnn_hmp_gemm_ukernel gemm_ukernel = xnn_params.q8.gemm.minmax.gemm;

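  // mr x nr is the micro-kernel's output tile; kr is the unroll factor along
  // the reduction (input-channel) dimension. Channels are padded up to these
  // tile sizes when the weights are packed below.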
  const uint32_t n_stride = round_up(group_output_channels, nr);
  const uint32_t k_stride = round_up_po2(group_input_channels, kr);
  const uint32_t kernel_size = kernel_height * kernel_width;
  enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_igemm;
  size_t packed_group_weights_size = (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t)) * n_stride;
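  // A strided deconvolution without dilation can be decomposed into
  // stride_height * stride_width smaller convolutions (subkernels), one per
  // output offset modulo the stride; this avoids computing over the
  // zero-stuffed input that the generic IGEMM path would see.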
  if (max(stride_height, stride_width) > 1 && max(dilation_height, dilation_width) == 1 && stride_width <= kernel_width && stride_height <= kernel_height) {
    ukernel_type = xnn_ukernel_type_subconv2d;
    const size_t subkernels = stride_height * stride_width;
    packed_group_weights_size = n_stride *
      (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t) * subkernels);

    const size_t subconvolution_buffer_size = sizeof(struct subconvolution_params) * subkernels;
    deconvolution_op->subconvolution_buffer = xnn_allocate_zero_memory(subconvolution_buffer_size);
    if (deconvolution_op->subconvolution_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for subconvolution buffer", subconvolution_buffer_size);
      goto error;
    }

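    // Partition the kernel into subkernels. Each subkernel (offset_y,
    // offset_x) gets the kernel taps at positions congruent to that offset
    // modulo the stride; e.g. a 3x3 kernel with 2x2 stride splits into
    // subkernels of 2x2, 2x1, 1x2, and 1x1 taps.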
    struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
    for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
      for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
        const size_t subkernel_height = divide_round_up(kernel_height - offset_y, stride_height);
        const size_t subkernel_width = divide_round_up(kernel_width - offset_x, stride_width);
        const size_t subkernel_size = subkernel_height * subkernel_width;

        subconvolution_params->indirection_x_stride = sizeof(void*) * subkernel_size;
        subconvolution_params->w_stride = sizeof(int32_t) + k_stride * subkernel_size * sizeof(uint8_t);
        subconvolution_params++;
      }
    }
  }
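  // The packed-weights buffer is pre-filled with kernel_zero_point so that
  // entries in the padded channel range read as zero once the kernel zero
  // point is subtracted inside the Q8 micro-kernel.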
  deconvolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
  if (deconvolution_op->packed_weights == NULL) {
    xnn_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
    goto error;
  }
  memset(deconvolution_op->packed_weights, kernel_zero_point, packed_group_weights_size * groups);

  switch (ukernel_type) {
    case xnn_ukernel_type_igemm:
      xnn_pack_q8_conv_goki_w(
        groups, group_output_channels, kernel_size, group_input_channels,
        nr, kr,
        input_zero_point, kernel_zero_point,
        kernel, bias, deconvolution_op->packed_weights);
      break;
    case xnn_ukernel_type_subconv2d:
      xnn_pack_q8_deconv_goki_w(
        groups, group_output_channels, kernel_height, kernel_width, group_input_channels,
        stride_height, stride_width,
        nr, kr,
        input_zero_point, kernel_zero_point,
        kernel, bias, deconvolution_op->packed_weights, deconvolution_op->subconvolution_buffer);
      break;
    default:
      XNN_UNREACHABLE;
  }

  const size_t zero_size = sizeof(uint8_t) * k_stride + XNN_EXTRA_BYTES;
  void* zero_buffer = xnn_allocate_simd_memory(zero_size);
  if (zero_buffer == NULL) {
    xnn_log_error("failed to allocate %zu bytes for zero padding", zero_size);
    goto error;
  }
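  // Out-of-bounds entries in the indirection buffer point at this buffer;
  // filling it with input_zero_point makes those taps contribute zero after
  // the input zero point is subtracted.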
  memset(zero_buffer, input_zero_point, zero_size);
  deconvolution_op->zero_buffer = zero_buffer;

  deconvolution_op->padding_top = output_padding_top;
  deconvolution_op->padding_right = output_padding_right;
  deconvolution_op->padding_bottom = output_padding_bottom;
  deconvolution_op->padding_left = output_padding_left;

  deconvolution_op->kernel_height = kernel_height;
  deconvolution_op->kernel_width = kernel_width;
  deconvolution_op->stride_height = stride_height;
  deconvolution_op->stride_width = stride_width;
  deconvolution_op->dilation_height = dilation_height;
  deconvolution_op->dilation_width = dilation_width;
  deconvolution_op->groups = groups;
  deconvolution_op->group_input_channels = group_input_channels;
  deconvolution_op->group_output_channels = group_output_channels;
  deconvolution_op->input_pixel_stride = input_pixel_stride;
  deconvolution_op->output_pixel_stride = output_pixel_stride;

  deconvolution_op->kernel_zero_point = kernel_zero_point;

  deconvolution_op->q8_gemm_params =
    xnn_init_q8_gemm_params(
      input_zero_point, kernel_zero_point,
      deconvolution_scale, output_zero_point, output_min, output_max);

  deconvolution_op->type = xnn_operator_type_deconvolution_nhwc_q8;
  deconvolution_op->ukernel.type = ukernel_type;
  deconvolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
    .general_case = igemm_ukernel,
    .gemm_case = gemm_ukernel,
    .mr = mr,
    .nr = nr,
    .kr = kr,
  };

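  // With unit stride, TensorFlow SAME padding depends only on the kernel and
  // dilation, so it can be fixed here; otherwise it depends on the input size
  // and is recomputed at setup time (the flag is stashed for that purpose).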
  if (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
    if ((stride_height | stride_width) == 1) {
      // Padding can be computed statically
      const uint32_t padding_height = (kernel_height - 1) * dilation_height;
      const uint32_t padding_width = (kernel_width - 1) * dilation_width;

      const uint32_t padding_top = padding_height / 2;
      const uint32_t padding_left = padding_width / 2;

      deconvolution_op->padding_top = padding_top;
      deconvolution_op->padding_left = padding_left;
      deconvolution_op->padding_bottom = padding_height - padding_top;
      deconvolution_op->padding_right = padding_width - padding_left;
    } else {
      deconvolution_op->flags = XNN_FLAG_TENSORFLOW_SAME_PADDING;
    }
  }

  deconvolution_op->state = xnn_run_state_invalid;

  *deconvolution_op_out = deconvolution_op;
  return xnn_status_success;

error:
  xnn_delete_operator(deconvolution_op);
  return status;
}

enum xnn_status xnn_create_deconvolution2d_nhwc_f32(
    uint32_t output_padding_top,
    uint32_t output_padding_right,
    uint32_t output_padding_bottom,
    uint32_t output_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t stride_height,
    uint32_t stride_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    const float* kernel,
    const float* bias,
    float output_min,
    float output_max,
    uint32_t flags,
    xnn_operator_t* deconvolution_op_out)
{
  xnn_operator_t deconvolution_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to create Deconvolution operator: XNNPACK is not initialized");
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      kernel_width, kernel_height);
    goto error;
  }

  if (stride_width == 0 || stride_height == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero",
      stride_width, stride_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " dilation: "
      "dilation dimensions must be non-zero",
      dilation_width, dilation_height);
    goto error;
  }

  if (groups == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 " groups: number of groups must be non-zero", groups);
    goto error;
  }

  if (group_input_channels == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %zu input channels per group: "
      "number of channels must be non-zero",
      group_input_channels);
    goto error;
  }

  if (group_output_channels == 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %zu output channels per group: "
      "number of channels must be non-zero",
      group_output_channels);
    goto error;
  }

  const size_t input_channels = groups * group_input_channels;
  if (input_pixel_stride < input_channels) {
    xnn_log_error(
      "failed to create Deconvolution operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
      input_pixel_stride, groups, group_input_channels);
    goto error;
  }

  const size_t output_channels = groups * group_output_channels;
  if (output_pixel_stride < output_channels) {
    xnn_log_error(
      "failed to create Deconvolution operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
      output_pixel_stride, groups, group_output_channels);
    goto error;
  }

  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create Deconvolution operator with NaN output lower bound: lower bound must be non-NaN");
    goto error;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create Deconvolution operator with NaN output upper bound: upper bound must be non-NaN");
    goto error;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create Deconvolution operator with [%.7g, %.7g] output range: "
      "lower bound must be below upper bound",
      output_min, output_max);
    goto error;
  }

  const bool any_padding = (output_padding_left | output_padding_top | output_padding_right | output_padding_bottom) != 0;
  if (any_padding && (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
    xnn_log_error(
      "failed to create Deconvolution operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32 " padding: "
      "TensorFlow SAME padding can't be combined with explicit padding specification",
      output_padding_top, output_padding_left, output_padding_bottom, output_padding_right);
    goto error;
  }

  status = xnn_status_out_of_memory;

  deconvolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (deconvolution_op == NULL) {
    xnn_log_error("failed to allocate %zu bytes for Deconvolution operator descriptor", sizeof(struct xnn_operator));
    goto error;
  }

  const struct gemm_parameters* gemm_params = &xnn_params.f32.gemm;
  if (gemm_params->nr > group_output_channels) {
    // Default micro-kernel is suboptimal. Try to find a better micro-kernel.
    if (xnn_params.f32.gemm2.minmax.igemm.function[XNN_UARCH_DEFAULT] != NULL) {
      gemm_params = &xnn_params.f32.gemm2;
    }
  }
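  // xnn_params.f32.gemm2 typically provides a micro-kernel with a narrower nr
  // tile, so fewer padded output channels are wasted when
  // group_output_channels is small.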
  const uint32_t mr = gemm_params->mr;
  const uint32_t nr = gemm_params->nr;
  const uint32_t kr = UINT32_C(1) << gemm_params->log2_kr;
  const uint32_t sr = UINT32_C(1) << gemm_params->log2_sr;
  const struct gemm_fused_ukernels* ukernels = &gemm_params->minmax;
  const bool linear_activation = (output_max == INFINITY) && (output_min == -output_max);
  if (linear_activation && gemm_params->linear.gemm.function[XNN_UARCH_DEFAULT] != NULL) {
    ukernels = &gemm_params->linear;
  }
  struct xnn_hmp_igemm_ukernel igemm_ukernel = ukernels->igemm;
  struct xnn_hmp_gemm_ukernel gemm_ukernel = ukernels->gemm;

  const uint32_t n_stride = round_up(group_output_channels, nr);
  const uint32_t k_stride = round_up_po2(group_input_channels, kr);
  const uint32_t kernel_size = kernel_height * kernel_width;
  enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_igemm;
  size_t packed_group_weights_size = (sizeof(float) * kernel_size * k_stride + sizeof(float)) * n_stride;
  if (max(stride_height, stride_width) > 1 && max(dilation_height, dilation_width) == 1 && stride_width <= kernel_width && stride_height <= kernel_height) {
    ukernel_type = xnn_ukernel_type_subconv2d;
    const size_t subkernels = stride_height * stride_width;
    packed_group_weights_size = n_stride *
      (sizeof(float) * kernel_size * k_stride + sizeof(float) * subkernels);

    const size_t subconvolution_buffer_size = sizeof(struct subconvolution_params) * subkernels;
    deconvolution_op->subconvolution_buffer = xnn_allocate_zero_memory(subconvolution_buffer_size);
    if (deconvolution_op->subconvolution_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for subconvolution buffer", subconvolution_buffer_size);
      goto error;
    }

    struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
    for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
      for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
        const size_t subkernel_height = divide_round_up(kernel_height - offset_y, stride_height);
        const size_t subkernel_width = divide_round_up(kernel_width - offset_x, stride_width);
        const size_t subkernel_size = subkernel_height * subkernel_width;

        subconvolution_params->indirection_x_stride = sizeof(void*) * subkernel_size;
        subconvolution_params->w_stride = sizeof(float) + k_stride * subkernel_size * sizeof(float);
        subconvolution_params++;
      }
    }
  }
  deconvolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
  if (deconvolution_op->packed_weights == NULL) {
    xnn_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
    goto error;
  }
  memset(deconvolution_op->packed_weights, 0, packed_group_weights_size * groups);

  switch (ukernel_type) {
    case xnn_ukernel_type_igemm:
      xnn_pack_f32_conv_goki_w(
        groups, group_output_channels, kernel_size, group_input_channels,
        nr, kr, sr,
        kernel, bias, deconvolution_op->packed_weights);
      break;
    case xnn_ukernel_type_subconv2d:
      xnn_pack_f32_deconv_goki_w(
        groups, group_output_channels, kernel_height, kernel_width, group_input_channels,
        stride_height, stride_width,
        nr, kr, sr,
        kernel, bias, deconvolution_op->packed_weights, deconvolution_op->subconvolution_buffer);
      break;
    default:
      XNN_UNREACHABLE;
  }

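  // For F32 the zero point is 0.0f, so a zero-initialized allocation is
  // enough; the Q8 path instead fills this buffer with input_zero_point.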
  const size_t zero_size = k_stride * sizeof(float) + XNN_EXTRA_BYTES;
  void* zero_buffer = xnn_allocate_zero_simd_memory(zero_size);
  if (zero_buffer == NULL) {
    xnn_log_error("failed to allocate %zu bytes for zero padding", zero_size);
    goto error;
  }
  deconvolution_op->zero_buffer = zero_buffer;

  deconvolution_op->padding_top = output_padding_top;
  deconvolution_op->padding_right = output_padding_right;
  deconvolution_op->padding_bottom = output_padding_bottom;
  deconvolution_op->padding_left = output_padding_left;

  deconvolution_op->kernel_height = kernel_height;
  deconvolution_op->kernel_width = kernel_width;
  deconvolution_op->stride_height = stride_height;
  deconvolution_op->stride_width = stride_width;
  deconvolution_op->dilation_height = dilation_height;
  deconvolution_op->dilation_width = dilation_width;
  deconvolution_op->groups = groups;
  deconvolution_op->group_input_channels = group_input_channels;
  deconvolution_op->group_output_channels = group_output_channels;
  deconvolution_op->input_pixel_stride = input_pixel_stride;
  deconvolution_op->output_pixel_stride = output_pixel_stride;

  deconvolution_op->f32_minmax_params = xnn_init_f32_minmax_params(output_min, output_max);

  deconvolution_op->type = xnn_operator_type_deconvolution_nhwc_f32;
  deconvolution_op->ukernel.type = ukernel_type;
  deconvolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
    .general_case = igemm_ukernel,
    .gemm_case = gemm_ukernel,
    .mr = mr,
    .nr = nr,
    .kr = kr,
  };

  if (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
    if ((stride_height | stride_width) == 1) {
      // Padding can be computed statically
      const uint32_t padding_height = (kernel_height - 1) * dilation_height;
      const uint32_t padding_width = (kernel_width - 1) * dilation_width;

      const uint32_t padding_top = padding_height / 2;
      const uint32_t padding_left = padding_width / 2;

      deconvolution_op->padding_top = padding_top;
      deconvolution_op->padding_left = padding_left;
      deconvolution_op->padding_bottom = padding_height - padding_top;
      deconvolution_op->padding_right = padding_width - padding_left;
    } else {
      deconvolution_op->flags = XNN_FLAG_TENSORFLOW_SAME_PADDING;
    }
  }

  deconvolution_op->state = xnn_run_state_invalid;

  *deconvolution_op_out = deconvolution_op;
  return xnn_status_success;

error:
  xnn_delete_operator(deconvolution_op);
  return status;
}

static enum xnn_status setup_conv_path(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  size_t output_height,
  size_t output_width,
  void* output,
  uint32_t log2_input_element_size,
  uint32_t log2_filter_element_size,
  uint32_t bias_element_size,
  uint32_t log2_output_element_size,
  const void* params,
  size_t num_threads)
{
  assert(deconvolution_op->ukernel.type == xnn_ukernel_type_igemm);

  const size_t kernel_height = deconvolution_op->kernel_height;
  const size_t kernel_width = deconvolution_op->kernel_width;
  const size_t kernel_size = kernel_height * kernel_width;

  const size_t groups = deconvolution_op->groups;
  const size_t output_size = output_height * output_width;
  const size_t mr = deconvolution_op->ukernel.igemm.mr;
  const size_t tiled_output_size = round_up(output_size, mr);
  const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;

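  // The indirection buffer holds, for each (output pixel, kernel tap) pair, a
  // pointer to the input row it reads; it only needs to be rebuilt when the
  // input dimensions change.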
  if (input_height != deconvolution_op->last_input_height ||
      input_width != deconvolution_op->last_input_width)
  {
    const void** indirection_buffer = (const void**) xnn_reallocate_memory(deconvolution_op->indirection_buffer, indirection_buffer_size);
    if (indirection_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
      return xnn_status_out_of_memory;
    }
    deconvolution_op->indirection_buffer = indirection_buffer;
    deconvolution_op->last_input = input;
    deconvolution_op->last_input_height = input_height;
    deconvolution_op->last_input_width = input_width;

    xnn_indirection_init_deconv2d(deconvolution_op, mr, log2_input_element_size);
  }

  const size_t group_input_channels = deconvolution_op->group_input_channels;
  const size_t group_output_channels = deconvolution_op->group_output_channels;
  const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
  const size_t w_stride = bias_element_size +
    (round_up_po2(group_input_channels, deconvolution_op->ukernel.igemm.kr) * kernel_size << log2_filter_element_size);
  deconvolution_op->context.igemm = (struct igemm_context) {
    .ks = kernel_size,
    .ks_scaled = kernel_size * mr * sizeof(void*),
    .kc = group_input_channels << log2_input_element_size,
    .w_stride = w_stride,
    .indirect_a = deconvolution_op->indirection_buffer,
    .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
    .zero = deconvolution_op->zero_buffer,
    .packed_w = deconvolution_op->packed_weights,
    .c = deconvolution_op->output,
    .cm_stride = deconvolution_op->output_pixel_stride << log2_output_element_size,
    .cn_stride = nr << log2_output_element_size,
    .ga_stride = group_input_channels << log2_input_element_size,
    .gw_stride = w_stride * round_up(group_output_channels, nr),
    .gc_stride = group_output_channels << log2_output_element_size,
    .ba_stride = input_height * input_width * deconvolution_op->input_pixel_stride << log2_input_element_size,
    .bc_stride = output_size * deconvolution_op->output_pixel_stride << log2_output_element_size,
    .log2_csize = log2_output_element_size,
    .ukernel = deconvolution_op->ukernel.igemm.general_case,
  };
  if (output_size == 1 && deconvolution_op->ukernel.igemm.mr1_case.function[XNN_UARCH_DEFAULT] != NULL) {
    deconvolution_op->context.igemm.ukernel = deconvolution_op->ukernel.igemm.mr1_case;
  }
  memcpy(&deconvolution_op->context.igemm.params, params, sizeof(deconvolution_op->context.igemm.params));

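  // Heuristic: when multithreaded, shrink the output-channel tile (nc) so
  // that each thread gets about target_tiles_per_thread tiles, keeping nc a
  // multiple of nr.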
  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
    if (max_nc < nc) {
      nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
    }
  }
  if (groups == 1) {
    deconvolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
    deconvolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_igemm;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = output_size;
    deconvolution_op->compute.range[2] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  } else {
    deconvolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
    deconvolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_igemm;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = groups;
    deconvolution_op->compute.range[2] = output_size;
    deconvolution_op->compute.range[3] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  }
  deconvolution_op->state = xnn_run_state_ready;
  return xnn_status_success;
}

static enum xnn_status setup_subconv2d_path(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  const void* input,
  size_t output_height,
  size_t output_width,
  void* output,
  uint32_t log2_input_element_size,
  uint32_t log2_filter_element_size,
  uint32_t bias_element_size,
  uint32_t log2_output_element_size,
  const void* params,
  size_t num_threads,
  bool use_gemm)
{
  assert(deconvolution_op->ukernel.type == xnn_ukernel_type_subconv2d);

  const size_t kernel_height = deconvolution_op->kernel_height;
  const size_t kernel_width = deconvolution_op->kernel_width;
  const size_t kernel_size = kernel_height * kernel_width;
  const size_t stride_height = deconvolution_op->stride_height;
  const size_t stride_width = deconvolution_op->stride_width;

  const size_t groups = deconvolution_op->groups;
  const size_t output_size = output_height * output_width;
  const size_t mr = deconvolution_op->ukernel.igemm.mr;

  const size_t input_pixel_stride = deconvolution_op->input_pixel_stride << log2_input_element_size;
  const size_t output_pixel_stride = deconvolution_op->output_pixel_stride << log2_output_element_size;

  const bool any_size_change =
    input_height != deconvolution_op->last_input_height ||
    input_width != deconvolution_op->last_input_width ||
    output_height != deconvolution_op->last_output_height ||
    output_width != deconvolution_op->last_output_width;

  if (any_size_change || output != deconvolution_op->last_output) {
    // Initialize subconvolution parameters which depend on output dimensions or MR.
    struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
    const size_t modulo_padding_top = deconvolution_op->padding_top % stride_height;
    const size_t modulo_padding_left = deconvolution_op->padding_left % stride_width;
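    // Each subconvolution writes the output pixels congruent to its
    // (offset_y, offset_x) modulo the stride, shifted by the output padding.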
    for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
      for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
        const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
        const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
        subconvolution_params->scaled_kernel_size = mr * subconvolution_params->indirection_x_stride;
        subconvolution_params->slice_width = divide_round_up(output_width - output_x_start, stride_width);
        subconvolution_params->slice_height = divide_round_up(output_height - output_y_start, stride_height);
        subconvolution_params->output =
          (void*) ((uintptr_t) output + ((output_y_start * output_width + output_x_start) * output_pixel_stride));
        ++subconvolution_params;
      }
    }
    deconvolution_op->last_output = output;
  }

  if (any_size_change) {
    if (!use_gemm) {
      const size_t indirection_buffer_size = sizeof(void*) *
        kernel_size * output_height * stride_width * round_up(divide_round_up(output_width, stride_width), mr);

      const void** indirection_buffer =
        (const void**) xnn_reallocate_memory(deconvolution_op->indirection_buffer, indirection_buffer_size);
      if (indirection_buffer == NULL) {
        xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
        return xnn_status_out_of_memory;
      }
      deconvolution_op->indirection_buffer = indirection_buffer;
      deconvolution_op->last_input = input;

      xnn_indirection_init_subconv2d(deconvolution_op, mr, log2_input_element_size);
    }
    deconvolution_op->last_input_height = input_height;
    deconvolution_op->last_input_width = input_width;
    deconvolution_op->last_output_height = output_height;
    deconvolution_op->last_output_width = output_width;
  }

  const size_t group_input_channels = deconvolution_op->group_input_channels;
  const size_t group_output_channels = deconvolution_op->group_output_channels;
  const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
  const uint32_t kr = deconvolution_op->ukernel.igemm.kr;
  const size_t w_stride = stride_height * stride_width * bias_element_size +
    (round_up_po2(group_input_channels, kr) * kernel_size << log2_filter_element_size);
  if (use_gemm) {
    deconvolution_op->context.subgemm = (struct subgemm_context) {
      .subconvolution_params = deconvolution_op->subconvolution_buffer,
      .kc = group_input_channels << log2_input_element_size,
      .a = input,
      .ax_stride = input_pixel_stride,
      .ay_stride = input_width * input_pixel_stride,
      .cx_stride = stride_width * output_pixel_stride,
      .cy_stride = stride_height * output_width * output_pixel_stride,
      .cn_stride = nr << log2_output_element_size,
      .ga_stride = group_input_channels << log2_input_element_size,
      .gw_stride = w_stride * round_up(group_output_channels, nr),
      .gc_stride = group_output_channels << log2_output_element_size,
      .ba_stride = input_height * input_width * input_pixel_stride,
      .bc_stride = output_size * output_pixel_stride,
      .log2_csize = log2_output_element_size,
      .ukernel = deconvolution_op->ukernel.igemm.gemm_case,
    };
    memcpy(&deconvolution_op->context.subgemm.params, params, sizeof(deconvolution_op->context.subgemm.params));
  } else {
    deconvolution_op->context.subconv = (struct subconv_context) {
      .subconvolution_params = deconvolution_op->subconvolution_buffer,
      .kc = group_input_channels << log2_input_element_size,
      .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
      .zero = deconvolution_op->zero_buffer,
      .cx_stride = stride_width * output_pixel_stride,
      .cy_stride = stride_height * output_width * output_pixel_stride,
      .cn_stride = nr << log2_output_element_size,
      .ga_stride = group_input_channels << log2_input_element_size,
      .gw_stride = w_stride * round_up(group_output_channels, nr),
      .gc_stride = group_output_channels << log2_output_element_size,
      .ba_stride = input_height * input_width * input_pixel_stride,
      .bc_stride = output_size * output_pixel_stride,
      .log2_csize = log2_output_element_size,
      .ukernel = deconvolution_op->ukernel.igemm.general_case,
    };
    memcpy(&deconvolution_op->context.subconv.params, params, sizeof(deconvolution_op->context.subconv.params));
  }

  const size_t output_height_positions = divide_round_up(output_height, stride_height);
  const size_t output_width_positions = divide_round_up(output_width, stride_width);

  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * stride_height * stride_width *
      output_height_positions * divide_round_up(output_width_positions, mr);
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
    if (max_nc < nc) {
      nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
    }
  }

  if (groups == 1) {
    deconvolution_op->compute.type = xnn_parallelization_type_5d_tile_2d;
    deconvolution_op->compute.task_5d_tile_2d = use_gemm ?
      (pthreadpool_task_5d_tile_2d_t) xnn_compute_subgemm2d : (pthreadpool_task_5d_tile_2d_t) xnn_compute_subconv2d;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = stride_height * stride_width;
    deconvolution_op->compute.range[2] = divide_round_up(output_height, stride_height);
    deconvolution_op->compute.range[3] = divide_round_up(output_width, stride_width);
    deconvolution_op->compute.range[4] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  } else {
    deconvolution_op->compute.type = xnn_parallelization_type_6d_tile_2d;
    deconvolution_op->compute.task_6d_tile_2d = use_gemm ?
      (pthreadpool_task_6d_tile_2d_t) xnn_compute_grouped_subgemm2d : (pthreadpool_task_6d_tile_2d_t) xnn_compute_grouped_subconv2d;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = groups;
    deconvolution_op->compute.range[2] = stride_height * stride_width;
    deconvolution_op->compute.range[3] = divide_round_up(output_height, stride_height);
    deconvolution_op->compute.range[4] = divide_round_up(output_width, stride_width);
    deconvolution_op->compute.range[5] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  }

  deconvolution_op->state = xnn_run_state_ready;
  return xnn_status_success;
}

static enum xnn_status setup_deconvolution2d(
  xnn_operator_t deconvolution_op,
  size_t batch_size,
  size_t input_height,
  size_t input_width,
  uint32_t adjustment_height,
  uint32_t adjustment_width,
  const void* input,
  void* output,
  uint32_t log2_input_element_size,
  uint32_t log2_filter_element_size,
  uint32_t bias_element_size,
  uint32_t log2_output_element_size,
  const void* params,
  size_t num_threads)
{
  deconvolution_op->state = xnn_run_state_invalid;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to setup Deconvolution operator: XNNPACK is not initialized");
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error(
      "failed to setup Deconvolution with %zux%zu input: input dimensions must be non-zero",
      input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  if (adjustment_height >= deconvolution_op->stride_height) {
    xnn_log_error(
      "failed to setup Deconvolution with %" PRIu32 " height adjustment: "
      "height adjustment must be smaller than height stride (%" PRIu32 ")",
      adjustment_height, deconvolution_op->stride_height);
    return xnn_status_invalid_parameter;
  }

  if (adjustment_width >= deconvolution_op->stride_width) {
    xnn_log_error(
      "failed to setup Deconvolution with %" PRIu32 " width adjustment: "
      "width adjustment must be smaller than width stride (%" PRIu32 ")",
      adjustment_width, deconvolution_op->stride_width);
    return xnn_status_invalid_parameter;
  }

  if (batch_size == 0) {
    deconvolution_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  deconvolution_op->batch_size = batch_size;
  deconvolution_op->input_height = input_height;
  deconvolution_op->input_width = input_width;
  deconvolution_op->input = input;
  deconvolution_op->output = output;

  if (deconvolution_op->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
    // Recompute padding for the input size.
    const uint32_t dilated_kernel_height_minus_1 = (deconvolution_op->kernel_height - 1) * deconvolution_op->dilation_height;
    const uint32_t dilated_kernel_width_minus_1 = (deconvolution_op->kernel_width - 1) * deconvolution_op->dilation_width;

    const size_t total_padding_height = doz(dilated_kernel_height_minus_1, (input_height - 1) % deconvolution_op->stride_height);
    const size_t total_padding_width = doz(dilated_kernel_width_minus_1, (input_width - 1) % deconvolution_op->stride_width);

    const uint32_t padding_top = deconvolution_op->padding_top = total_padding_height / 2;
    const uint32_t padding_left = deconvolution_op->padding_left = total_padding_width / 2;
    deconvolution_op->padding_bottom = total_padding_height - padding_top;
    deconvolution_op->padding_right = total_padding_width - padding_left;
  }

  const size_t output_height = deconvolution_op->output_height = compute_output_dimension(
    input_height, deconvolution_op->padding_top + deconvolution_op->padding_bottom,
    adjustment_height, deconvolution_op->kernel_height, deconvolution_op->dilation_height, deconvolution_op->stride_height);
  const size_t output_width = deconvolution_op->output_width = compute_output_dimension(
    input_width, deconvolution_op->padding_left + deconvolution_op->padding_right,
    adjustment_width, deconvolution_op->kernel_width, deconvolution_op->dilation_width, deconvolution_op->stride_width);

  switch (deconvolution_op->ukernel.type) {
    case xnn_ukernel_type_igemm:
      return setup_conv_path(
        deconvolution_op,
        batch_size,
        input_height, input_width, input,
        output_height, output_width, output,
        log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
        params, num_threads);
    case xnn_ukernel_type_subconv2d:
    {
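      // When the kernel exactly tiles the stride and there is no padding or
      // adjustment, every input pixel maps to a disjoint output block, so the
      // indirection buffer can be skipped in favor of a plain GEMM.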
      const bool no_padding = (deconvolution_op->padding_top | deconvolution_op->padding_right | deconvolution_op->padding_bottom | deconvolution_op->padding_left) == 0;
      const bool no_adjustment = (adjustment_height | adjustment_width) == 0;
      const bool use_gemm = no_padding && no_adjustment &&
        deconvolution_op->kernel_height == deconvolution_op->stride_height &&
        deconvolution_op->kernel_width == deconvolution_op->stride_width &&
        deconvolution_op->ukernel.igemm.gemm_case.function[XNN_UARCH_DEFAULT] != NULL;
      return setup_subconv2d_path(
        deconvolution_op,
        batch_size,
        input_height, input_width, input,
        output_height, output_width, output,
        log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
        params, num_threads, use_gemm);
    }
    default:
      XNN_UNREACHABLE;
  }
}

enum xnn_status xnn_setup_deconvolution2d_nhwc_q8(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    uint32_t adjustment_height,
    uint32_t adjustment_width,
    const uint8_t* input,
    uint8_t* output,
    pthreadpool_t threadpool)
{
  if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_q8) {
    xnn_log_error("failed to setup Deconvolution (NHWC, Q8) operator: operator type mismatch");
    return xnn_status_invalid_parameter;
  }

  return setup_deconvolution2d(
    deconvolution_op,
    batch_size, input_height, input_width,
    adjustment_height, adjustment_width,
    input, output,
    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
    &deconvolution_op->q8_gemm_params,
    pthreadpool_get_threads_count(threadpool));
}

enum xnn_status xnn_setup_deconvolution2d_nhwc_f32(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    uint32_t adjustment_height,
    uint32_t adjustment_width,
    const float* input,
    float* output,
    pthreadpool_t threadpool)
{
  if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_f32) {
    xnn_log_error("failed to setup Deconvolution (NHWC, F32) operator: operator type mismatch");
    return xnn_status_invalid_parameter;
  }

  return setup_deconvolution2d(
    deconvolution_op,
    batch_size, input_height, input_width,
    adjustment_height, adjustment_width,
    input, output,
    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
    2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
    sizeof(float) /* sizeof(bias element) */,
    2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
    &deconvolution_op->f32_minmax_params,
    pthreadpool_get_threads_count(threadpool));
}