blob: 3b3b0917cf581cc8bba8210637b9477a2fae5c4e [file] [log] [blame]
XNNPACK Teamb455b122019-09-27 18:10:33 -07001// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/indirection.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>

26
// Computes one spatial output dimension of a deconvolution, clamping the
// result at zero when the output padding exceeds the unpadded extent.
static inline size_t compute_output_dimension(
    size_t input_dimension,
    size_t output_padding_dimension,
    size_t adjustment_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t stride_dimension)
{
  // Extent of the dilated kernel: dilation inserts (dilation - 1) gaps
  // between consecutive kernel taps.
  const size_t dilated_kernel_dimension =
    (kernel_dimension - 1) * dilation_dimension + 1;
  // Unpadded output extent of the transposed convolution.
  const size_t unpadded_dimension =
    stride_dimension * (input_dimension - 1) + adjustment_dimension + dilated_kernel_dimension;
  // doz == "difference or zero": saturating subtraction of the output padding.
  return doz(unpadded_dimension, output_padding_dimension);
}
40
41enum xnn_status xnn_create_deconvolution2d_nhwc_q8(
42 uint32_t output_padding_top,
43 uint32_t output_padding_right,
44 uint32_t output_padding_bottom,
45 uint32_t output_padding_left,
XNNPACK Teamb455b122019-09-27 18:10:33 -070046 uint32_t kernel_height,
47 uint32_t kernel_width,
48 uint32_t stride_height,
49 uint32_t stride_width,
50 uint32_t dilation_height,
51 uint32_t dilation_width,
52 uint32_t groups,
53 size_t group_input_channels,
54 size_t group_output_channels,
55 size_t input_pixel_stride,
56 size_t output_pixel_stride,
57 uint8_t input_zero_point,
58 float input_scale,
59 uint8_t kernel_zero_point,
60 float kernel_scale,
61 const uint8_t* kernel,
62 const int32_t* bias,
63 uint8_t output_zero_point,
64 float output_scale,
65 uint8_t output_min,
66 uint8_t output_max,
67 uint32_t flags,
68 xnn_operator_t* deconvolution_op_out)
69{
70 xnn_operator_t deconvolution_op = NULL;
71 enum xnn_status status = xnn_status_uninitialized;
72
73 if (!xnn_params.initialized) {
74 xnn_log_error("failed to create Deconvolution operator: XNNPACK is not initialized");
75 goto error;
76 }
77
78 status = xnn_status_invalid_parameter;
79
80 if (kernel_width == 0 || kernel_height == 0) {
81 xnn_log_error(
82 "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
83 kernel_width, kernel_height);
84 goto error;
85 }
86
87 if (stride_width == 0 || stride_height == 0) {
88 xnn_log_error(
89 "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero",
90 stride_width, stride_height);
91 goto error;
92 }
93
94 if (dilation_width == 0 || dilation_height == 0) {
95 xnn_log_error(
96 "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " dilation: "
97 "dilation dimensions must be non-zero",
98 dilation_width, dilation_height);
99 goto error;
100 }
101
102 if (groups == 0) {
103 xnn_log_error(
104 "failed to create Deconvolution operator with %" PRIu32 " groups: number of groups must be non-zero", groups);
105 goto error;
106 }
107
108 if (group_input_channels == 0) {
109 xnn_log_error(
110 "failed to create Deconvolution operator with %zu input channels per group: "
111 "number of channels must be non-zero",
112 group_input_channels);
113 goto error;
114 }
115
116 if (group_output_channels == 0) {
117 xnn_log_error(
118 "failed to create Deconvolution operator with %zu output channels per group: "
119 "number of channels must be non-zero",
120 group_output_channels);
121 goto error;
122 }
123
124 const size_t input_channels = groups * group_input_channels;
125 if (input_pixel_stride < input_channels) {
126 xnn_log_error(
127 "failed to create Deconvolution operator with input pixel stride of %zu: "
128 "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
129 input_pixel_stride, groups, group_input_channels);
130 goto error;
131 }
132
133 const size_t output_channels = groups * group_output_channels;
134 if (output_pixel_stride < output_channels) {
135 xnn_log_error(
136 "failed to create Deconvolution operator with output pixel stride of %zu: "
137 "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
138 output_pixel_stride, groups, group_output_channels);
139 goto error;
140 }
141
142 if (input_scale <= 0.0f || !isnormal(input_scale)) {
143 xnn_log_error(
144 "failed to create Deconvolution operator with %.7g input scale: scale must be finite, normalized, and positive",
145 input_scale);
146 goto error;
147 }
148
149 if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
150 xnn_log_error(
151 "failed to create Deconvolution operator with %.7g kernel scale: scale must be finite, normalized, and positive",
152 kernel_scale);
153 goto error;
154 }
155
156 if (output_scale <= 0.0f || !isnormal(output_scale)) {
157 xnn_log_error(
158 "failed to create Deconvolution operator with %.7g output scale: scale must be finite, normalized, and positive",
159 output_scale);
160 goto error;
161 }
162
163 if (output_min >= output_max) {
164 xnn_log_error(
165 "failed to create Deconvolution operator with [%" PRIu8 ", %" PRIu8 "] output range: "
166 "range min must be below range max",
167 output_min, output_max);
168 goto error;
169 }
170
171 status = xnn_status_unsupported_parameter;
172
173 const float deconvolution_scale = input_scale * kernel_scale / output_scale;
174 if (deconvolution_scale >= 1.0f) {
175 xnn_log_error(
176 "failed to create Deconvolution operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
177 "Deconvolution operator scale %.7g is greater or equal to 1.0",
178 input_scale, kernel_scale, output_scale, deconvolution_scale);
179 goto error;
180 }
181
182 status = xnn_status_out_of_memory;
183
Marat Dukhan04f03be2019-11-19 12:36:47 -0800184 deconvolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700185 if (deconvolution_op == NULL) {
186 xnn_log_error("failed to allocate %zu bytes for Deconvolution operator descriptor", sizeof(struct xnn_operator));
187 goto error;
188 }
189
190 const uint32_t mr = xnn_params.q8.gemm.mr;
191 const uint32_t nr = xnn_params.q8.gemm.nr;
192 const uint32_t kr = UINT32_C(1) << xnn_params.q8.gemm.log2_kr;
193 const xnn_igemm_ukernel_function ukernel_function = xnn_params.q8.gemm.igemm;
194
195 const uint32_t n_stride = round_up(group_output_channels, nr);
196 const uint32_t k_stride = round_up_po2(group_input_channels, kr);
197 const uint32_t kernel_size = kernel_height * kernel_width;
198 enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_igemm;
199 size_t packed_group_weights_size = (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t)) * n_stride;
200 if (max(stride_height, stride_width) > 1 && max(dilation_height, dilation_width) == 1 && stride_width <= kernel_width && stride_height <= kernel_height) {
201 ukernel_type = xnn_ukernel_type_subconv2d;
202 const size_t subkernels = stride_height * stride_width;
203 packed_group_weights_size = n_stride *
204 (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t) * subkernels);
205
206 const size_t subconvolution_buffer_size = sizeof(struct subconvolution_params) * subkernels;
Marat Dukhan04f03be2019-11-19 12:36:47 -0800207 deconvolution_op->subconvolution_buffer = xnn_allocate_zero_simd_memory(subconvolution_buffer_size);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700208 if (deconvolution_op->subconvolution_buffer == NULL) {
209 xnn_log_error("failed to allocate %zu bytes for subconvolution buffer", subconvolution_buffer_size);
210 goto error;
211 }
212
213 struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
214 for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
215 for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
216 const size_t subkernel_height = divide_round_up(kernel_height - offset_y, stride_height);
217 const size_t subkernel_width = divide_round_up(kernel_width - offset_x, stride_width);
218 const size_t subkernel_size = subkernel_height * subkernel_width;
219
220 subconvolution_params->indirection_x_stride = sizeof(void*) * subkernel_size;
221 subconvolution_params->w_stride = sizeof(int32_t) + k_stride * subkernel_size * sizeof(uint8_t);
222 subconvolution_params++;
223 }
224 }
225 }
Marat Dukhan04f03be2019-11-19 12:36:47 -0800226 deconvolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700227 if (deconvolution_op->packed_weights == NULL) {
228 xnn_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
229 goto error;
230 }
231 memset(deconvolution_op->packed_weights, kernel_zero_point, packed_group_weights_size * groups);
232
233 switch (ukernel_type) {
234 case xnn_ukernel_type_igemm:
235 xnn_pack_q8_conv_goki_w(
236 groups, group_output_channels, kernel_size, group_input_channels,
237 nr, kr,
238 input_zero_point, kernel_zero_point,
239 kernel, bias, deconvolution_op->packed_weights);
240 break;
241 case xnn_ukernel_type_subconv2d:
242 xnn_pack_q8_deconv_goki_w(
243 groups, group_output_channels, kernel_height, kernel_width, group_input_channels,
244 stride_height, stride_width,
245 nr, kr,
246 input_zero_point, kernel_zero_point,
247 kernel, bias, deconvolution_op->packed_weights, deconvolution_op->subconvolution_buffer);
248 break;
249 default:
250 XNN_UNREACHABLE;
251 }
252
253 size_t zero_size = sizeof(uint8_t) * k_stride + XNN_EXTRA_BYTES;
Marat Dukhan04f03be2019-11-19 12:36:47 -0800254 void* zero_buffer = xnn_allocate_simd_memory(zero_size);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700255 if (zero_buffer == NULL) {
256 xnn_log_error("failed to allocate %zu bytes for zero padding", zero_size);
257 goto error;
258 }
259 memset(zero_buffer, input_zero_point, zero_size);
260 deconvolution_op->zero_buffer = zero_buffer;
261
262 deconvolution_op->padding_top = output_padding_top;
263 deconvolution_op->padding_right = output_padding_right;
264 deconvolution_op->padding_bottom = output_padding_bottom;
265 deconvolution_op->padding_left = output_padding_left;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700266
267 deconvolution_op->kernel_height = kernel_height;
268 deconvolution_op->kernel_width = kernel_width;
269 deconvolution_op->stride_height = stride_height;
270 deconvolution_op->stride_width = stride_width;
271 deconvolution_op->dilation_height = dilation_height;
272 deconvolution_op->dilation_width = dilation_width;
273 deconvolution_op->groups = groups;
274 deconvolution_op->group_input_channels = group_input_channels;
275 deconvolution_op->group_output_channels = group_output_channels;
276 deconvolution_op->input_pixel_stride = input_pixel_stride;
277 deconvolution_op->output_pixel_stride = output_pixel_stride;
278
279 deconvolution_op->kernel_zero_point = kernel_zero_point;
280
281 deconvolution_op->q8_gemm_params =
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700282 xnn_init_q8_gemm_params(
XNNPACK Teamb455b122019-09-27 18:10:33 -0700283 input_zero_point, kernel_zero_point,
284 deconvolution_scale, output_zero_point, output_min, output_max);
285
Marat Dukhanefc47b82019-11-18 09:25:38 -0800286 deconvolution_op->type = xnn_operator_type_deconvolution_nhwc_q8;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700287 deconvolution_op->ukernel.type = ukernel_type;
288 deconvolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
289 .default_function = ukernel_function,
290 .mr = mr,
291 .nr = nr,
292 .kr = kr,
293 };
294
295 deconvolution_op->state = xnn_run_state_invalid;
296
297 *deconvolution_op_out = deconvolution_op;
298 return xnn_status_success;
299
300error:
301 xnn_delete_operator(deconvolution_op);
302 return status;
303}
304
305enum xnn_status xnn_create_deconvolution2d_nhwc_f32(
306 uint32_t output_padding_top,
307 uint32_t output_padding_right,
308 uint32_t output_padding_bottom,
309 uint32_t output_padding_left,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700310 uint32_t kernel_height,
311 uint32_t kernel_width,
312 uint32_t stride_height,
313 uint32_t stride_width,
314 uint32_t dilation_height,
315 uint32_t dilation_width,
316 uint32_t groups,
317 size_t group_input_channels,
318 size_t group_output_channels,
319 size_t input_pixel_stride,
320 size_t output_pixel_stride,
321 const float* kernel,
322 const float* bias,
323 float output_min,
324 float output_max,
325 uint32_t flags,
326 xnn_operator_t* deconvolution_op_out)
327{
328 xnn_operator_t deconvolution_op = NULL;
329 enum xnn_status status = xnn_status_uninitialized;
330
331 if (!xnn_params.initialized) {
332 xnn_log_error("failed to create Deconvolution operator: XNNPACK is not initialized");
333 goto error;
334 }
335
336 status = xnn_status_invalid_parameter;
337
338 if (kernel_width == 0 || kernel_height == 0) {
339 xnn_log_error(
340 "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
341 kernel_width, kernel_height);
342 goto error;
343 }
344
345 if (stride_width == 0 || stride_height == 0) {
346 xnn_log_error(
347 "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " stride: stride dimensions must be non-zero",
348 stride_width, stride_height);
349 goto error;
350 }
351
352 if (dilation_width == 0 || dilation_height == 0) {
353 xnn_log_error(
354 "failed to create Deconvolution operator with %" PRIu32 "x%" PRIu32 " dilation: "
355 "dilation dimensions must be non-zero",
356 dilation_width, dilation_height);
357 goto error;
358 }
359
360 if (groups == 0) {
361 xnn_log_error(
362 "failed to create Deconvolution operator with %" PRIu32 " groups: number of groups must be non-zero", groups);
363 goto error;
364 }
365
366 if (group_input_channels == 0) {
367 xnn_log_error(
368 "failed to create Deconvolution operator with %zu input channels per group: "
369 "number of channels must be non-zero",
370 group_input_channels);
371 goto error;
372 }
373
374 if (group_output_channels == 0) {
375 xnn_log_error(
376 "failed to create Deconvolution operator with %zu output channels per group: "
377 "number of channels must be non-zero",
378 group_output_channels);
379 goto error;
380 }
381
382 const size_t input_channels = groups * group_input_channels;
383 if (input_pixel_stride < input_channels) {
384 xnn_log_error(
385 "failed to create Deconvolution operator with input pixel stride of %zu: "
386 "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
387 input_pixel_stride, groups, group_input_channels);
388 goto error;
389 }
390
391 const size_t output_channels = groups * group_output_channels;
392 if (output_pixel_stride < output_channels) {
393 xnn_log_error(
394 "failed to create Deconvolution operator with output pixel stride of %zu: "
395 "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
396 output_pixel_stride, groups, group_output_channels);
397 goto error;
398 }
399
400 if (isnan(output_min)) {
401 xnn_log_error(
402 "failed to create Deconvolution operator with NaN output lower bound: lower bound must be non-NaN");
403 goto error;
404 }
405
406 if (isnan(output_max)) {
407 xnn_log_error(
408 "failed to create Deconvolution operator with NaN output upper bound: upper bound must be non-NaN");
409 goto error;
410 }
411
412 if (output_min >= output_max) {
413 xnn_log_error(
414 "failed to create Deconvolution operator with [%.7g, %.7g] output range: "
415 "lower bound must be below upper bound",
416 output_min, output_max);
417 goto error;
418 }
419
420 status = xnn_status_out_of_memory;
421
Marat Dukhan04f03be2019-11-19 12:36:47 -0800422 deconvolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
XNNPACK Teamb455b122019-09-27 18:10:33 -0700423 if (deconvolution_op == NULL) {
424 xnn_log_error("failed to allocate %zu bytes for Deconvolution operator descriptor", sizeof(struct xnn_operator));
425 goto error;
426 }
427
428 uint32_t mr = xnn_params.f32.gemm.mr;
429 uint32_t nr = xnn_params.f32.gemm.nr;
430 uint32_t kr = UINT32_C(1) << xnn_params.f32.gemm.log2_kr;
431 uint32_t sr = UINT32_C(1) << xnn_params.f32.gemm.log2_sr;
432 xnn_igemm_ukernel_function ukernel_function = xnn_params.f32.gemm.igemm;
433 if (nr > group_output_channels) {
434 // Default micro-kernel is suboptimal. Try to find a better micro-kernel.
435 if (xnn_params.f32.gemm2.igemm != NULL) {
436 mr = xnn_params.f32.gemm2.mr;
437 nr = xnn_params.f32.gemm2.nr;
438 kr = UINT32_C(1) << xnn_params.f32.gemm2.log2_kr;
439 sr = UINT32_C(1) << xnn_params.f32.gemm2.log2_sr;
440 ukernel_function = xnn_params.f32.gemm2.igemm;
441 }
442 }
443
444 const uint32_t n_stride = round_up(group_output_channels, nr);
445 const uint32_t k_stride = round_up_po2(group_input_channels, kr);
446 const uint32_t kernel_size = kernel_height * kernel_width;
447 enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_igemm;
448 size_t packed_group_weights_size = (sizeof(float) * kernel_size * k_stride + sizeof(float)) * n_stride;
449 if (max(stride_height, stride_width) > 1 && max(dilation_height, dilation_width) == 1 && stride_width <= kernel_width && stride_height <= kernel_height) {
450 ukernel_type = xnn_ukernel_type_subconv2d;
451 const size_t subkernels = stride_height * stride_width;
452 packed_group_weights_size = n_stride *
453 (sizeof(float) * kernel_size * k_stride + sizeof(float) * subkernels);
454
455 const size_t subconvolution_buffer_size = sizeof(struct subconvolution_params) * subkernels;
Marat Dukhan04f03be2019-11-19 12:36:47 -0800456 deconvolution_op->subconvolution_buffer = xnn_allocate_zero_simd_memory(subconvolution_buffer_size);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700457 if (deconvolution_op->subconvolution_buffer == NULL) {
458 xnn_log_error("failed to allocate %zu bytes for subconvolution buffer", subconvolution_buffer_size);
459 goto error;
460 }
461
462 struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
463 for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
464 for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
465 const size_t subkernel_height = divide_round_up(kernel_height - offset_y, stride_height);
466 const size_t subkernel_width = divide_round_up(kernel_width - offset_x, stride_width);
467 const size_t subkernel_size = subkernel_height * subkernel_width;
468
469 subconvolution_params->indirection_x_stride = sizeof(void*) * subkernel_size;
470 subconvolution_params->w_stride = sizeof(float) + k_stride * subkernel_size * sizeof(float);
471 subconvolution_params++;
472 }
473 }
474 }
Marat Dukhan04f03be2019-11-19 12:36:47 -0800475 deconvolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700476 if (deconvolution_op->packed_weights == NULL) {
477 xnn_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
478 goto error;
479 }
480 memset(deconvolution_op->packed_weights, 0, packed_group_weights_size * groups);
481
482 switch (ukernel_type) {
483 case xnn_ukernel_type_igemm:
484 xnn_pack_f32_conv_goki_w(
485 groups, group_output_channels, kernel_size, group_input_channels,
486 nr, kr, sr,
487 kernel, bias, deconvolution_op->packed_weights);
488 break;
489 case xnn_ukernel_type_subconv2d:
490 xnn_pack_f32_deconv_goki_w(
491 groups, group_output_channels, kernel_height, kernel_width, group_input_channels,
492 stride_height, stride_width,
Marat Dukhanc4ae7de2019-10-25 02:06:26 -0700493 nr, kr, sr,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700494 kernel, bias, deconvolution_op->packed_weights, deconvolution_op->subconvolution_buffer);
495 break;
496 default:
497 XNN_UNREACHABLE;
498 }
499
500 const size_t zero_size = k_stride * sizeof(float) + XNN_EXTRA_BYTES;
Marat Dukhan04f03be2019-11-19 12:36:47 -0800501 void* zero_buffer = xnn_allocate_zero_simd_memory(zero_size);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700502 if (zero_buffer == NULL) {
503 xnn_log_error("failed to allocate %zu bytes for zero padding", zero_size);
504 goto error;
505 }
506 deconvolution_op->zero_buffer = zero_buffer;
507
508 deconvolution_op->padding_top = output_padding_top;
509 deconvolution_op->padding_right = output_padding_right;
510 deconvolution_op->padding_bottom = output_padding_bottom;
511 deconvolution_op->padding_left = output_padding_left;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700512
513 deconvolution_op->kernel_height = kernel_height;
514 deconvolution_op->kernel_width = kernel_width;
515 deconvolution_op->stride_height = stride_height;
516 deconvolution_op->stride_width = stride_width;
517 deconvolution_op->dilation_height = dilation_height;
518 deconvolution_op->dilation_width = dilation_width;
519 deconvolution_op->groups = groups;
520 deconvolution_op->group_input_channels = group_input_channels;
521 deconvolution_op->group_output_channels = group_output_channels;
522 deconvolution_op->input_pixel_stride = input_pixel_stride;
523 deconvolution_op->output_pixel_stride = output_pixel_stride;
524
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700525 deconvolution_op->f32_output_params = xnn_init_f32_output_params(output_min, output_max);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700526
Marat Dukhanefc47b82019-11-18 09:25:38 -0800527 deconvolution_op->type = xnn_operator_type_deconvolution_nhwc_f32;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700528 deconvolution_op->ukernel.type = ukernel_type;
529 deconvolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
530 .default_function = ukernel_function,
531 .mr = mr,
532 .nr = nr,
533 .kr = kr,
534 };
535
536 deconvolution_op->state = xnn_run_state_invalid;
537
538 *deconvolution_op_out = deconvolution_op;
539 return xnn_status_success;
540
541error:
542 xnn_delete_operator(deconvolution_op);
543 return status;
544}
545
// Configures a deconvolution operator to run as a single indirect GEMM
// (IGEMM) over the whole kernel — the generic path used when the
// subconvolution decomposition does not apply.
//
// Rebuilds the indirection buffer only when the input spatial dimensions
// change, fills in the igemm_context with all strides the micro-kernel
// needs, and chooses a parallelization shape. Element sizes arrive as
// log2 values so byte strides can be computed with shifts.
//
// Returns xnn_status_success, or xnn_status_out_of_memory if the
// indirection buffer cannot be (re)allocated.
static enum xnn_status setup_conv_path(
    xnn_operator_t deconvolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    size_t output_height,
    size_t output_width,
    void* output,
    uint32_t log2_input_element_size,
    uint32_t log2_filter_element_size,
    uint32_t bias_element_size,
    uint32_t log2_output_element_size,
    const void* params,
    size_t num_threads)
{
  assert(deconvolution_op->ukernel.type == xnn_ukernel_type_igemm);

  const size_t kernel_height = deconvolution_op->kernel_height;
  const size_t kernel_width = deconvolution_op->kernel_width;
  const size_t kernel_size = kernel_height * kernel_width;

  const size_t groups = deconvolution_op->groups;
  const size_t output_size = output_height * output_width;
  const size_t mr = deconvolution_op->ukernel.igemm.mr;
  // Output positions rounded up to the MR tile so every tile has a full
  // set of indirection pointers.
  const size_t tiled_output_size = round_up(output_size, mr);
  const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;

  // The indirection buffer depends only on input geometry; rebuild it only
  // when the input height/width changed since the last setup.
  if (input_height != deconvolution_op->last_input_height ||
      input_width != deconvolution_op->last_input_width)
  {
    const void** indirection_buffer = (const void**) xnn_reallocate_memory(deconvolution_op->indirection_buffer, indirection_buffer_size);
    if (indirection_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
      return xnn_status_out_of_memory;
    }
    deconvolution_op->indirection_buffer = indirection_buffer;
    // Remember the input pointer and geometry the buffer was built against.
    deconvolution_op->last_input = input;
    deconvolution_op->last_input_height = input_height;
    deconvolution_op->last_input_width = input_width;

    xnn_indirection_init_deconv2d(deconvolution_op, mr, log2_input_element_size);
  }

  const size_t group_input_channels = deconvolution_op->group_input_channels;
  const size_t group_output_channels = deconvolution_op->group_output_channels;
  const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
  // Stride between packed weight columns: one bias element plus the
  // KR-rounded input channels for every kernel tap.
  const size_t w_stride = bias_element_size +
    (round_up_po2(group_input_channels, deconvolution_op->ukernel.igemm.kr) * kernel_size << log2_filter_element_size);
  deconvolution_op->context.igemm = (struct igemm_context) {
      .ks = kernel_size,
      .ks_scaled = kernel_size * mr * sizeof(void*),
      .kc = group_input_channels << log2_input_element_size,
      .w_stride = w_stride,
      .indirect_a = deconvolution_op->indirection_buffer,
      // Byte delta between the current input pointer and the one the
      // indirection buffer was built with — avoids rebuilding the buffer
      // when only the input base address changes.
      .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
      .zero = deconvolution_op->zero_buffer,
      .packed_w = deconvolution_op->packed_weights,
      .c = deconvolution_op->output,
      .cm_stride = deconvolution_op->output_pixel_stride << log2_output_element_size,
      .cn_stride = nr << log2_output_element_size,
      .ga_stride = group_input_channels << log2_input_element_size,
      .gw_stride = w_stride * round_up(group_output_channels, nr),
      .gc_stride = group_output_channels << log2_output_element_size,
      .ba_stride = input_height * input_width * deconvolution_op->input_pixel_stride << log2_input_element_size,
      .bc_stride = output_size * deconvolution_op->output_pixel_stride << log2_output_element_size,
      .log2_csize = log2_output_element_size,
      .ukernel = deconvolution_op->ukernel.igemm.default_function,
  };
  // Prefer the specialized MR=1 micro-kernel for a single output pixel,
  // when one is available.
  if (output_size == 1 && deconvolution_op->ukernel.igemm.mr1_function != NULL) {
    deconvolution_op->context.igemm.ukernel = deconvolution_op->ukernel.igemm.mr1_function;
  }
  memcpy(&deconvolution_op->context.igemm.params, params, sizeof(deconvolution_op->context.igemm.params));

  // Shrink the N tile so multi-threaded runs get roughly
  // target_tiles_per_thread tiles per thread, keeping NR alignment.
  size_t nc = group_output_channels;
  if (num_threads > 1) {
    const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
    const size_t target_tiles_per_thread = 5;
    const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
    if (max_nc < nc) {
      nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
    }
  }
  // Grouped deconvolution needs an extra parallelization dimension for the
  // group index.
  if (groups == 1) {
    deconvolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
    deconvolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_igemm;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = output_size;
    deconvolution_op->compute.range[2] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  } else {
    deconvolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
    deconvolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_gigemm;
    deconvolution_op->compute.range[0] = batch_size;
    deconvolution_op->compute.range[1] = groups;
    deconvolution_op->compute.range[2] = output_size;
    deconvolution_op->compute.range[3] = group_output_channels;
    deconvolution_op->compute.tile[0] = mr;
    deconvolution_op->compute.tile[1] = nc;
  }
  deconvolution_op->state = xnn_run_state_ready;
  return xnn_status_success;
}
650
651static enum xnn_status setup_subconv2d_path(
652 xnn_operator_t deconvolution_op,
653 size_t batch_size,
654 size_t input_height,
655 size_t input_width,
656 const void* input,
657 size_t output_height,
658 size_t output_width,
659 void* output,
660 uint32_t log2_input_element_size,
661 uint32_t log2_filter_element_size,
662 uint32_t bias_element_size,
663 uint32_t log2_output_element_size,
664 const void* params,
665 size_t num_threads)
666{
667 assert(deconvolution_op->ukernel.type == xnn_ukernel_type_subconv2d);
668
669 const size_t kernel_height = deconvolution_op->kernel_height;
670 const size_t kernel_width = deconvolution_op->kernel_width;
671 const size_t kernel_size = kernel_height * kernel_width;
672 const size_t stride_height = deconvolution_op->stride_height;
673 const size_t stride_width = deconvolution_op->stride_width;
674
675 const size_t groups = deconvolution_op->groups;
676 const size_t output_size = output_height * output_width;
677 const size_t mr = deconvolution_op->ukernel.igemm.mr;
678 const size_t indirection_buffer_size =
679 sizeof(void*) * kernel_size * output_height * stride_width * round_up(divide_round_up(output_width, stride_width), mr);
680
681 if (input_height != deconvolution_op->last_input_height ||
682 input_width != deconvolution_op->last_input_width)
683 {
Marat Dukhan04f03be2019-11-19 12:36:47 -0800684 const void** indirection_buffer = (const void**) xnn_reallocate_memory(deconvolution_op->indirection_buffer, indirection_buffer_size);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700685 if (indirection_buffer == NULL) {
686 xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
687 return xnn_status_out_of_memory;
688 }
689 deconvolution_op->indirection_buffer = indirection_buffer;
690 deconvolution_op->last_input = input;
691 deconvolution_op->last_input_height = input_height;
692 deconvolution_op->last_input_width = input_width;
693
694 xnn_indirection_init_subconv2d(deconvolution_op, mr, log2_input_element_size);
695
696 // Initialize subconvolution parameters which depend on output dimensions or MR.
697 struct subconvolution_params* subconvolution_params = deconvolution_op->subconvolution_buffer;
698 const size_t modulo_padding_top = deconvolution_op->padding_top % stride_height;
699 const size_t modulo_padding_left = deconvolution_op->padding_left % stride_width;
700 const size_t output_pixel_stride = deconvolution_op->output_pixel_stride << log2_output_element_size;
701 for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
702 for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
703 const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
704 const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
705 subconvolution_params->scaled_kernel_size = mr * subconvolution_params->indirection_x_stride;
706 subconvolution_params->slice_width = divide_round_up(output_width - output_x_start, stride_width);
707 subconvolution_params->slice_height = divide_round_up(output_height - output_y_start, stride_height);
708 subconvolution_params->output =
709 (void*) ((uintptr_t) output + ((output_y_start * output_width + output_x_start) * output_pixel_stride));
710 ++subconvolution_params;
711 }
712 }
713 }
714
715 const size_t group_input_channels = deconvolution_op->group_input_channels;
716 const size_t group_output_channels = deconvolution_op->group_output_channels;
717 const uint32_t nr = deconvolution_op->ukernel.igemm.nr;
718 const size_t w_stride = stride_height * stride_width * bias_element_size +
719 (round_up_po2(group_input_channels, deconvolution_op->ukernel.igemm.kr) * kernel_size << log2_filter_element_size);
720 deconvolution_op->context.subconv = (struct subconv_context) {
721 .subconvolution_params = deconvolution_op->subconvolution_buffer,
722 .kc = group_input_channels << log2_input_element_size,
723 .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) deconvolution_op->last_input),
724 .zero = deconvolution_op->zero_buffer,
725 .cx_stride = stride_width * deconvolution_op->output_pixel_stride << log2_output_element_size,
726 .cy_stride = stride_height * output_width * deconvolution_op->output_pixel_stride << log2_output_element_size,
727 .cn_stride = nr << log2_output_element_size,
728 .ga_stride = group_input_channels << log2_input_element_size,
729 .gw_stride = w_stride * round_up(group_output_channels, nr),
730 .gc_stride = group_output_channels << log2_output_element_size,
731 .ba_stride = input_height * input_width * deconvolution_op->input_pixel_stride << log2_input_element_size,
732 .bc_stride = output_size * deconvolution_op->output_pixel_stride << log2_output_element_size,
733 .log2_csize = log2_output_element_size,
734 .ukernel = deconvolution_op->ukernel.igemm.default_function,
735 };
736 memcpy(&deconvolution_op->context.subconv.params, params, sizeof(deconvolution_op->context.subconv.params));
737
738 const size_t output_height_positions = divide_round_up(output_height, stride_height);
739 const size_t output_width_positions = divide_round_up(output_width, stride_width);
740
741 size_t nc = group_output_channels;
742 if (num_threads > 1) {
743 const size_t num_other_tiles = groups * stride_height * stride_width *
744 output_height_positions * divide_round_up(output_width_positions, mr);
745 const size_t target_tiles_per_thread = 5;
746 const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
747 if (max_nc < nc) {
748 nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
749 }
750 }
751
752 if (groups == 1) {
753 deconvolution_op->compute.type = xnn_parallelization_type_5d_tile_2d;
754 deconvolution_op->compute.task_5d_tile_2d = (pthreadpool_task_5d_tile_2d_t) xnn_compute_subconv2d;
755 deconvolution_op->compute.range[0] = batch_size;
756 deconvolution_op->compute.range[1] = stride_height * stride_width;
757 deconvolution_op->compute.range[2] = divide_round_up(output_height, stride_height);
758 deconvolution_op->compute.range[3] = divide_round_up(output_width, stride_width);
759 deconvolution_op->compute.range[4] = group_output_channels;
760 deconvolution_op->compute.tile[0] = mr;
761 deconvolution_op->compute.tile[1] = nc;
762 } else {
763 deconvolution_op->compute.type = xnn_parallelization_type_6d_tile_2d;
764 deconvolution_op->compute.task_6d_tile_2d = (pthreadpool_task_6d_tile_2d_t) xnn_compute_gsubconv2d;
765 deconvolution_op->compute.range[0] = batch_size;
766 deconvolution_op->compute.range[1] = groups;
767 deconvolution_op->compute.range[2] = stride_height * stride_width;
768 deconvolution_op->compute.range[3] = divide_round_up(output_height, stride_height);
769 deconvolution_op->compute.range[4] = divide_round_up(output_width, stride_width);
770 deconvolution_op->compute.range[5] = group_output_channels;
771 deconvolution_op->compute.tile[0] = mr;
772 deconvolution_op->compute.tile[1] = nc;
773 }
774
775 deconvolution_op->state = xnn_run_state_ready;
776 return xnn_status_success;
777}
778
779static enum xnn_status setup_deconvolution2d(
780 xnn_operator_t deconvolution_op,
781 size_t batch_size,
782 size_t input_height,
783 size_t input_width,
Marat Dukhan1898b912019-11-05 12:25:18 -0800784 uint32_t adjustment_height,
785 uint32_t adjustment_width,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700786 const void* input,
787 void* output,
788 uint32_t log2_input_element_size,
789 uint32_t log2_filter_element_size,
790 uint32_t bias_element_size,
791 uint32_t log2_output_element_size,
792 const void* params,
793 size_t num_threads)
794{
795 deconvolution_op->state = xnn_run_state_invalid;
796
797 if (!xnn_params.initialized) {
798 xnn_log_error("failed to setup Deconvolution operator: XNNPACK is not initialized");
799 return xnn_status_uninitialized;
800 }
801
802 if (input_width == 0 || input_height == 0) {
803 xnn_log_error(
804 "failed to setup Deconvolution with %zux%zu input: input dimensions must be non-zero",
805 input_width, input_height);
806 return xnn_status_invalid_parameter;
807 }
808
Marat Dukhan1898b912019-11-05 12:25:18 -0800809 if (adjustment_height >= deconvolution_op->stride_height) {
810 xnn_log_error(
811 "failed to setup Deconvolution with %" PRIu32 " height adjustment: "
812 "height adjustment must be smaller than height stride (%" PRIu32 ")",
813 adjustment_height, deconvolution_op->stride_height);
814 return xnn_status_invalid_parameter;
815 }
816
817 if (adjustment_width >= deconvolution_op->stride_width) {
818 xnn_log_error(
819 "failed to setup Deconvolution with %" PRIu32 " width adjustment: "
820 "width adjustment must be smaller than width stride (%" PRIu32 ")",
821 adjustment_width, deconvolution_op->stride_width);
822 return xnn_status_invalid_parameter;
823 }
824
XNNPACK Teamb455b122019-09-27 18:10:33 -0700825 if (batch_size == 0) {
826 deconvolution_op->state = xnn_run_state_skip;
827 return xnn_status_success;
828 }
829
830 deconvolution_op->batch_size = batch_size;
831 deconvolution_op->input_height = input_height;
832 deconvolution_op->input_width = input_width;
833 deconvolution_op->input = input;
834 deconvolution_op->output = output;
835
836 const size_t output_height = deconvolution_op->output_height = compute_output_dimension(
837 input_height, deconvolution_op->padding_top + deconvolution_op->padding_bottom,
Marat Dukhan1898b912019-11-05 12:25:18 -0800838 adjustment_height, deconvolution_op->kernel_height, deconvolution_op->dilation_height, deconvolution_op->stride_height);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700839 const size_t output_width = deconvolution_op->output_width = compute_output_dimension(
840 input_width, deconvolution_op->padding_left + deconvolution_op->padding_right,
Marat Dukhan1898b912019-11-05 12:25:18 -0800841 adjustment_width, deconvolution_op->kernel_width, deconvolution_op->dilation_width, deconvolution_op->stride_width);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700842
843 switch (deconvolution_op->ukernel.type) {
844 case xnn_ukernel_type_igemm:
845 return setup_conv_path(
846 deconvolution_op,
847 batch_size,
848 input_height, input_width, input,
849 output_height, output_width, output,
850 log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
851 params, num_threads);
852 case xnn_ukernel_type_subconv2d:
853 return setup_subconv2d_path(
854 deconvolution_op,
855 batch_size,
856 input_height, input_width, input,
857 output_height, output_width, output,
858 log2_input_element_size, log2_filter_element_size, bias_element_size, log2_output_element_size,
859 params, num_threads);
860 default:
861 XNN_UNREACHABLE;
862 }
863}
864
865enum xnn_status xnn_setup_deconvolution2d_nhwc_q8(
866 xnn_operator_t deconvolution_op,
867 size_t batch_size,
868 size_t input_height,
869 size_t input_width,
Marat Dukhan1898b912019-11-05 12:25:18 -0800870 uint32_t adjustment_height,
871 uint32_t adjustment_width,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700872 const uint8_t* input,
873 uint8_t* output,
874 pthreadpool_t threadpool)
875{
Marat Dukhanefc47b82019-11-18 09:25:38 -0800876 if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_q8) {
877 xnn_log_error("failed to setup Deconvolution (NHWC, Q8) operator: operator type mismatch");
XNNPACK Teamb455b122019-09-27 18:10:33 -0700878 return xnn_status_invalid_parameter;
879 }
880
881 return setup_deconvolution2d(
882 deconvolution_op,
883 batch_size, input_height, input_width,
Marat Dukhan1898b912019-11-05 12:25:18 -0800884 adjustment_height, adjustment_width,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700885 input, output,
886 0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
887 0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
888 sizeof(int32_t) /* sizeof(bias element) */,
889 0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
890 &deconvolution_op->q8_gemm_params,
891 pthreadpool_get_threads_count(threadpool));
892}
893
894enum xnn_status xnn_setup_deconvolution2d_nhwc_f32(
895 xnn_operator_t deconvolution_op,
896 size_t batch_size,
897 size_t input_height,
898 size_t input_width,
Marat Dukhan1898b912019-11-05 12:25:18 -0800899 uint32_t adjustment_height,
900 uint32_t adjustment_width,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700901 const float* input,
902 float* output,
903 pthreadpool_t threadpool)
904{
Marat Dukhanefc47b82019-11-18 09:25:38 -0800905 if (deconvolution_op->type != xnn_operator_type_deconvolution_nhwc_f32) {
906 xnn_log_error("failed to setup Deconvolution (NHWC, F32) operator: operator type mismatch");
XNNPACK Teamb455b122019-09-27 18:10:33 -0700907 return xnn_status_invalid_parameter;
908 }
909
910 return setup_deconvolution2d(
911 deconvolution_op,
912 batch_size, input_height, input_width,
Marat Dukhan1898b912019-11-05 12:25:18 -0800913 adjustment_height, adjustment_width,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700914 input, output,
915 2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
916 2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
917 sizeof(float) /* sizeof(bias element) */,
918 2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
919 &deconvolution_op->f32_output_params,
920 pthreadpool_get_threads_count(threadpool));
921}