// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/compute.h>
#include <xnnpack/indirection.h>
#include <xnnpack/log.h>
#include <xnnpack/math.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/params-init.h>
#include <xnnpack/params.h>

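// Output size of a convolution along one dimension:
//   output = floor((padded_input - effective_kernel) / stride) + 1,
// where effective_kernel = (kernel - 1) * dilation + 1 and doz() saturates the subtraction at zero.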
static inline size_t compute_output_dimension(
    size_t padded_input_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t subsampling_dimension)
{
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  return doz(padded_input_dimension, effective_kernel_dimension) / subsampling_dimension + 1;
}

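// With TensorFlow SAME padding the output size depends only on the input size and stride: ceil(input / stride).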
static inline size_t compute_output_dimension_with_tf_same_padding(
    size_t input_dimension,
    size_t subsampling_dimension)
{
  return divide_round_up(input_dimension, subsampling_dimension);
}

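// Scan the table of DWConv micro-kernel parameters for an entry whose primary tile matches the kernel size;
// returns NULL when no single-pass micro-kernel covers this kernel size.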
static const struct dwconv_parameters* find_dwigemm_ukernel(
    size_t kernel_size,
    const struct dwconv_parameters* ukernel,
    size_t num_ukernels)
{
  while (num_ukernels-- != 0) {
    if (ukernel->primary_tile == kernel_size) {
      return ukernel;
    }
    ukernel++;
  }
  return NULL;
}

enum xnn_status xnn_create_convolution2d_nhwc_q8(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t subsampling_height,
    uint32_t subsampling_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    uint8_t input_zero_point,
    float input_scale,
    uint8_t kernel_zero_point,
    float kernel_scale,
    const uint8_t* kernel,
    const int32_t* bias,
    uint8_t output_zero_point,
    float output_scale,
    uint8_t output_min,
    uint8_t output_max,
    uint32_t flags,
    xnn_operator_t* convolution_op_out)
{
  xnn_operator_t convolution_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to create Convolution operator: XNNPACK is not initialized");
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      kernel_width, kernel_height);
    goto error;
  }

  if (subsampling_width == 0 || subsampling_height == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " subsampling: "
      "subsampling dimensions must be non-zero",
      subsampling_width, subsampling_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " dilation: "
      "dilation dimensions must be non-zero",
      dilation_width, dilation_height);
    goto error;
  }

  if (groups == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 " groups: number of groups must be non-zero", groups);
    goto error;
  }

  if (group_input_channels == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %zu input channels per group: "
      "number of channels must be non-zero",
      group_input_channels);
    goto error;
  }

  if (group_output_channels == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %zu output channels per group: "
      "number of channels must be non-zero",
      group_output_channels);
    goto error;
  }

  const size_t input_channels = groups * group_input_channels;
  if (input_pixel_stride < input_channels) {
    xnn_log_error(
      "failed to create Convolution operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
      input_pixel_stride, groups, group_input_channels);
    goto error;
  }

  const size_t output_channels = groups * group_output_channels;
  if (output_pixel_stride < output_channels) {
    xnn_log_error(
      "failed to create Convolution operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
      output_pixel_stride, groups, group_output_channels);
    goto error;
  }

  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create Convolution operator with %.7g input scale: scale must be finite, normalized, and positive",
      input_scale);
    goto error;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    xnn_log_error(
      "failed to create Convolution operator with %.7g kernel scale: scale must be finite, normalized, and positive",
      kernel_scale);
    goto error;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create Convolution operator with %.7g output scale: scale must be finite, normalized, and positive",
      output_scale);
    goto error;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create Convolution operator with [%" PRIu8 ", %" PRIu8 "] output range: "
      "range min must be below range max",
      output_min, output_max);
    goto error;
  }

  if ((flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) != 0 && group_input_channels != 1) {
    xnn_log_error(
      "failed to create Depthwise Convolution operator with %zu input channels per group: "
      "Depthwise Convolution must have exactly 1 input channel per group",
      group_input_channels);
    goto error;
  }

  const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
  if ((flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
    if (any_padding) {
      xnn_log_error(
        "failed to create Convolution operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32 " padding: "
        "TensorFlow SAME padding can't be combined with explicit padding specification",
        input_padding_top, input_padding_left, input_padding_bottom, input_padding_right);
      goto error;
    }
  }

  status = xnn_status_unsupported_parameter;

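  // The Q8 micro-kernels requantize accumulators with a fixed-point multiplier and shift, which requires the
  // combined scale (input_scale * kernel_scale / output_scale) to be strictly below 1.0.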
  const float convolution_scale = input_scale * kernel_scale / output_scale;
  if (convolution_scale >= 1.0f) {
    xnn_log_error(
      "failed to create Convolution operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "convolution scale %.7g is greater or equal to 1.0",
      input_scale, kernel_scale, output_scale, convolution_scale);
    goto error;
  }

  status = xnn_status_out_of_memory;

  convolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (convolution_op == NULL) {
    xnn_log_error("failed to allocate %zu bytes for Convolution operator descriptor", sizeof(struct xnn_operator));
    goto error;
  }

  const size_t kernel_size = kernel_height * kernel_width;

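  // Micro-kernel selection: a depthwise layout (1 input and 1 output channel per group, multiple groups) with a
  // matching single-pass DWConv micro-kernel uses the DWConv path; an unpadded, unit-stride 1x1 kernel maps
  // directly to GEMM; everything else goes through the indirect GEMM (IGEMM) path.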
  enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_none;
  const struct dwconv_parameters* dwconv_parameters = NULL;
  if (group_input_channels == 1 && group_output_channels == 1 && groups > 1 &&
      (dwconv_parameters = find_dwigemm_ukernel(kernel_size, xnn_params.q8.dwconv, XNN_MAX_Q8_DWCONV_UKERNELS)) != NULL)
  {
    ukernel_type = xnn_ukernel_type_dwconv;
  } else if (kernel_size == 1 && subsampling_height == 1 && subsampling_width == 1 && !any_padding) {
    ukernel_type = xnn_ukernel_type_gemm;
  } else {
    ukernel_type = xnn_ukernel_type_igemm;
  }

  size_t zero_size = 0;
  switch (ukernel_type) {
    case xnn_ukernel_type_dwconv:
    {
      assert(dwconv_parameters != NULL);
      assert(dwconv_parameters->primary_tile == kernel_size);

      const size_t c_stride = round_up_po2(groups, dwconv_parameters->channel_tile);
      const size_t packed_weights_size = (sizeof(uint8_t) * kernel_size + sizeof(int32_t)) * c_stride;
      convolution_op->packed_weights = xnn_allocate_simd_memory(packed_weights_size);
      if (convolution_op->packed_weights == NULL) {
        xnn_log_error("failed to allocate %zu bytes for packed weights", packed_weights_size);
        goto error;
      }

      if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
        xnn_pack_q8_dwconv_hwg_w(
          kernel_height, kernel_width,
          groups, dwconv_parameters->channel_tile,
          input_zero_point, kernel_zero_point,
          kernel, bias, convolution_op->packed_weights);
      } else {
        xnn_pack_q8_dwconv_ghw_w(
          kernel_height, kernel_width,
          groups, dwconv_parameters->channel_tile,
          input_zero_point, kernel_zero_point,
          kernel, bias, convolution_op->packed_weights);
      }

      convolution_op->ukernel.dwconv = (struct xnn_ukernel_dwconv) {
        .unipass_function = dwconv_parameters->minmax.unipass,
        .primary_tile = dwconv_parameters->primary_tile,
        .incremental_tile = dwconv_parameters->incremental_tile,
      };

      zero_size = sizeof(uint8_t) * c_stride + XNN_EXTRA_BYTES;
      break;
    }
    case xnn_ukernel_type_gemm:
    case xnn_ukernel_type_igemm:
    {
      const uint32_t nr = xnn_params.q8.gemm.nr;
      const uint32_t kr = UINT32_C(1) << xnn_params.q8.gemm.log2_kr;
      const size_t n_stride = round_up(group_output_channels, nr);
      const size_t k_stride = round_up_po2(group_input_channels, kr);

      const size_t packed_group_weights_size =
        (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t)) * n_stride;
      convolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
      if (convolution_op->packed_weights == NULL) {
        xnn_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
        goto error;
      }
      memset(convolution_op->packed_weights, kernel_zero_point, packed_group_weights_size * groups);

      switch (ukernel_type) {
        case xnn_ukernel_type_gemm:
          xnn_pack_q8_gemm_goi_w(
            groups, group_output_channels, group_input_channels,
            nr, kr,
            input_zero_point, kernel_zero_point,
            kernel, bias, convolution_op->packed_weights);
          convolution_op->ukernel.gemm = (struct xnn_ukernel_gemm) {
            .mr = xnn_params.q8.gemm.mr,
            .nr = nr,
            .kr = kr,
            .general_case = xnn_params.q8.gemm.minmax.gemm,
          };
          break;
        case xnn_ukernel_type_igemm:
          if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
            xnn_pack_q8_conv_kgo_w(
              groups, group_output_channels, kernel_size,
              nr, kr,
              input_zero_point, kernel_zero_point,
              kernel, bias, convolution_op->packed_weights);
          } else {
            xnn_pack_q8_conv_goki_w(
              groups, group_output_channels, kernel_size, group_input_channels,
              nr, kr,
              input_zero_point, kernel_zero_point,
              kernel, bias, convolution_op->packed_weights);
          }
          convolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
            .mr = xnn_params.q8.gemm.mr,
            .nr = nr,
            .kr = kr,
            .general_case = xnn_params.q8.gemm.minmax.igemm,
          };
          break;
        default:
          XNN_UNREACHABLE;
      }

      zero_size = sizeof(uint8_t) * k_stride + XNN_EXTRA_BYTES;
      break;
    }
    default:
      XNN_UNREACHABLE;
  }

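  // Padded regions read from a dedicated zero buffer; for Q8 it is filled with the input zero point so that
  // padding contributes the quantized representation of zero.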
  const bool tf_same_padding = (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0 && kernel_size != 1;
  if (any_padding || tf_same_padding) {
    void* zero_buffer = xnn_allocate_simd_memory(zero_size);
    if (zero_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for zero padding", zero_size);
      goto error;
    }
    memset(zero_buffer, input_zero_point, zero_size);
    convolution_op->zero_buffer = zero_buffer;
  }

  convolution_op->padding_top = input_padding_top;
  convolution_op->padding_right = input_padding_right;
  convolution_op->padding_bottom = input_padding_bottom;
  convolution_op->padding_left = input_padding_left;

  convolution_op->kernel_height = kernel_height;
  convolution_op->kernel_width = kernel_width;
  convolution_op->stride_height = subsampling_height;
  convolution_op->stride_width = subsampling_width;
  convolution_op->dilation_height = dilation_height;
  convolution_op->dilation_width = dilation_width;
  convolution_op->groups = groups;
  convolution_op->group_input_channels = group_input_channels;
  convolution_op->group_output_channels = group_output_channels;
  convolution_op->input_pixel_stride = input_pixel_stride;
  convolution_op->output_pixel_stride = output_pixel_stride;

  convolution_op->kernel_zero_point = kernel_zero_point;

  convolution_op->q8_gemm_params =
    xnn_init_q8_gemm_params(
      input_zero_point, kernel_zero_point,
      convolution_scale, output_zero_point, output_min, output_max);

  convolution_op->type = xnn_operator_type_convolution_nhwc_q8;
  convolution_op->ukernel.type = ukernel_type;
  if (tf_same_padding) {
    convolution_op->flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING;
  }

  convolution_op->state = xnn_run_state_invalid;

  *convolution_op_out = convolution_op;
  return xnn_status_success;

error:
  xnn_delete_operator(convolution_op);
  return status;
}

enum xnn_status xnn_create_convolution2d_nhwc_f32(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t subsampling_height,
    uint32_t subsampling_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    const float* kernel,
    const float* bias,
    float output_min,
    float output_max,
    uint32_t flags,
    xnn_operator_t* convolution_op_out)
{
  xnn_operator_t convolution_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to create Convolution operator: XNNPACK is not initialized");
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      kernel_width, kernel_height);
    goto error;
  }

  if (subsampling_width == 0 || subsampling_height == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " subsampling: "
      "subsampling dimensions must be non-zero",
      subsampling_width, subsampling_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " dilation: "
      "dilation dimensions must be non-zero",
      dilation_width, dilation_height);
    goto error;
  }

  if (groups == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 " groups: number of groups must be non-zero", groups);
    goto error;
  }

  if (group_input_channels == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %zu input channels per group: "
      "number of channels must be non-zero",
      group_input_channels);
    goto error;
  }

  if (group_output_channels == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %zu output channels per group: "
      "number of channels must be non-zero",
      group_output_channels);
    goto error;
  }

  const size_t input_channels = groups * group_input_channels;
  if (input_pixel_stride < input_channels) {
    xnn_log_error(
      "failed to create Convolution operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
      input_pixel_stride, groups, group_input_channels);
    goto error;
  }

  const size_t output_channels = groups * group_output_channels;
  if (output_pixel_stride < output_channels) {
    xnn_log_error(
      "failed to create Convolution operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
      output_pixel_stride, groups, group_output_channels);
    goto error;
  }

  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create Convolution operator with NaN output lower bound: lower bound must be non-NaN");
    goto error;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create Convolution operator with NaN output upper bound: upper bound must be non-NaN");
    goto error;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create Convolution operator with [%.7g, %.7g] output range: "
      "lower bound must be below upper bound",
      output_min, output_max);
    goto error;
  }

  if ((flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) != 0 && group_input_channels != 1) {
    xnn_log_error(
      "failed to create Depthwise Convolution operator with %zu input channels per group: "
      "Depthwise Convolution must have exactly 1 input channel per group",
      group_input_channels);
    goto error;
  }

  const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
  if ((flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0) {
    if (any_padding) {
      xnn_log_error(
        "failed to create Convolution operator with %" PRIu32 "+%" PRIu32 "x%" PRIu32 "+%" PRIu32 " padding: "
        "TensorFlow SAME padding can't be combined with explicit padding specification",
        input_padding_top, input_padding_left, input_padding_bottom, input_padding_right);
      goto error;
    }
  }

  status = xnn_status_out_of_memory;

  convolution_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
  if (convolution_op == NULL) {
    xnn_log_error("failed to allocate %zu bytes for Convolution operator descriptor", sizeof(struct xnn_operator));
    goto error;
  }

  const size_t kernel_size = kernel_height * kernel_width;

  enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_none;
  const struct dwconv_parameters* dwconv_parameters = NULL;
  const bool unit_subsampling = (subsampling_width | subsampling_height) == 1;
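  // Micro-kernel selection for F32: an unpadded, unit-stride 1x1 kernel with one input and one output channel
  // per group reduces to a per-channel multiply-add (VMULCADDC); a depthwise layout with a matching single-pass
  // DWConv micro-kernel uses DWConv; an unpadded, unit-stride 1x1 kernel maps to GEMM; everything else uses IGEMM.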
  if (group_input_channels == 1 && group_output_channels == 1 && kernel_size == 1 && unit_subsampling && !any_padding) {
    ukernel_type = xnn_ukernel_type_vmulcaddc;
  } else if (group_input_channels == 1 && group_output_channels == 1 && (dwconv_parameters =
               find_dwigemm_ukernel(kernel_size, xnn_params.f32.dwconv, XNN_MAX_F32_DWCONV_UKERNELS)) != NULL)
  {
    ukernel_type = xnn_ukernel_type_dwconv;
  } else if (kernel_size == 1 && unit_subsampling && !any_padding) {
    ukernel_type = xnn_ukernel_type_gemm;
  } else {
    ukernel_type = xnn_ukernel_type_igemm;
  }
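  // An unbounded output range makes the clamping in the minmax micro-kernels a no-op, so the linear
  // (non-clamping) variants are preferred below when they are available.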
  const bool linear_activation = (output_max == INFINITY) && (output_min == -output_max);

  size_t zero_size = 0;
  switch (ukernel_type) {
    case xnn_ukernel_type_vmulcaddc:
    {
      const size_t c_stride = round_up_po2(groups, xnn_params.f32.vmulcaddc.channel_tile);
      const size_t packed_weights_size = 2 * sizeof(float) * c_stride;
      convolution_op->packed_weights = xnn_allocate_simd_memory(packed_weights_size);
      if (convolution_op->packed_weights == NULL) {
        xnn_log_error("failed to allocate %zu bytes for packed weights", packed_weights_size);
        goto error;
      }

      xnn_pack_f32_vmulcaddc_w(
        groups, xnn_params.f32.vmulcaddc.channel_tile,
        kernel, bias, convolution_op->packed_weights);

      convolution_op->ukernel.vmulcaddc = (struct xnn_ukernel_vmulcaddc) {
        .function = xnn_params.f32.vmulcaddc.ukernel,
        .mr = xnn_params.f32.vmulcaddc.row_tile,
      };
      break;
    }
    case xnn_ukernel_type_dwconv:
    {
      assert(dwconv_parameters != NULL);
      assert(dwconv_parameters->primary_tile == kernel_size);

      const size_t c_stride = round_up_po2(groups, dwconv_parameters->channel_tile);
      const size_t packed_weights_size = (kernel_size + 1) * sizeof(float) * c_stride;
      convolution_op->packed_weights = xnn_allocate_simd_memory(packed_weights_size);
      if (convolution_op->packed_weights == NULL) {
        xnn_log_error("failed to allocate %zu bytes for packed weights", packed_weights_size);
        goto error;
      }

      if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
        xnn_pack_f32_dwconv_hwg_w(
          kernel_height, kernel_width,
          groups, dwconv_parameters->channel_tile,
          kernel, bias, convolution_op->packed_weights);
      } else {
        xnn_pack_f32_dwconv_ghw_w(
          kernel_height, kernel_width,
          groups, dwconv_parameters->channel_tile,
          kernel, bias, convolution_op->packed_weights);
      }

      const union dwconv_fused_ukernels* ukernels = &dwconv_parameters->minmax;
      if (linear_activation && dwconv_parameters->linear.unipass != NULL) {
        ukernels = &dwconv_parameters->linear;
      }
      convolution_op->ukernel.dwconv = (struct xnn_ukernel_dwconv) {
        .unipass_function = ukernels->unipass,
        .primary_tile = dwconv_parameters->primary_tile,
        .incremental_tile = dwconv_parameters->incremental_tile,
      };

      zero_size = sizeof(float) * c_stride;
      break;
    }
    case xnn_ukernel_type_gemm:
    case xnn_ukernel_type_igemm:
    {
      const uint32_t nr = xnn_params.f32.gemm.nr;
      const uint32_t kr = UINT32_C(1) << xnn_params.f32.gemm.log2_kr;
      const uint32_t sr = UINT32_C(1) << xnn_params.f32.gemm.log2_sr;
      const size_t n_stride = round_up(group_output_channels, nr);
      const size_t k_stride = round_up_po2(group_input_channels, kr);

      const size_t packed_group_weights_size = (kernel_size * k_stride + 1) * sizeof(float) * n_stride;
      convolution_op->packed_weights = xnn_allocate_simd_memory(packed_group_weights_size * groups);
      if (convolution_op->packed_weights == NULL) {
        xnn_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
        goto error;
      }
      memset(convolution_op->packed_weights, 0, packed_group_weights_size * groups);

      const struct gemm_fused_ukernels* ukernels = &xnn_params.f32.gemm.minmax;
      if (linear_activation && xnn_params.f32.gemm.linear.gemm.function[XNN_UARCH_DEFAULT] != NULL) {
        ukernels = &xnn_params.f32.gemm.linear;
      }
      switch (ukernel_type) {
        case xnn_ukernel_type_gemm:
          xnn_pack_f32_gemm_goi_w(
            groups, group_output_channels, group_input_channels,
            nr, kr, sr,
            kernel, bias, convolution_op->packed_weights);
          convolution_op->ukernel.gemm = (struct xnn_ukernel_gemm) {
            .mr = xnn_params.f32.gemm.mr,
            .nr = nr,
            .kr = kr,
            .general_case = ukernels->gemm,
            .mr1_case = ukernels->gemm1,
          };
          break;
        case xnn_ukernel_type_igemm:
          if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
            xnn_pack_f32_conv_kgo_w(
              groups, group_output_channels, kernel_size,
              nr, kr,
              kernel, bias, convolution_op->packed_weights);
          } else {
            xnn_pack_f32_conv_goki_w(
              groups, group_output_channels, kernel_size, group_input_channels,
              nr, kr, sr,
              kernel, bias, convolution_op->packed_weights);
          }
          convolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
            .mr = xnn_params.f32.gemm.mr,
            .nr = nr,
            .kr = kr,
            .general_case = ukernels->igemm,
            .mr1_case = ukernels->igemm1,
          };
          break;
        default:
          XNN_UNREACHABLE;
      }

      zero_size = sizeof(float) * k_stride;
      break;
    }
    default:
      XNN_UNREACHABLE;
  }

  const bool tf_same_padding = (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0 && kernel_size != 1;
  if (any_padding || tf_same_padding) {
    void* zero_buffer = xnn_allocate_zero_simd_memory(zero_size);
    if (zero_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for zero padding", zero_size);
      goto error;
    }
    convolution_op->zero_buffer = zero_buffer;
  }

  convolution_op->padding_top = input_padding_top;
  convolution_op->padding_right = input_padding_right;
  convolution_op->padding_bottom = input_padding_bottom;
  convolution_op->padding_left = input_padding_left;

  convolution_op->kernel_height = kernel_height;
  convolution_op->kernel_width = kernel_width;
  convolution_op->stride_height = subsampling_height;
  convolution_op->stride_width = subsampling_width;
  convolution_op->dilation_height = dilation_height;
  convolution_op->dilation_width = dilation_width;
  convolution_op->groups = groups;
  convolution_op->group_input_channels = group_input_channels;
  convolution_op->group_output_channels = group_output_channels;
  convolution_op->input_pixel_stride = input_pixel_stride;
  convolution_op->output_pixel_stride = output_pixel_stride;

  convolution_op->f32_minmax_params = xnn_init_f32_minmax_params(output_min, output_max);

  convolution_op->type = xnn_operator_type_convolution_nhwc_f32;
  convolution_op->ukernel.type = ukernel_type;
  if (tf_same_padding) {
    convolution_op->flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING;
  }

  convolution_op->state = xnn_run_state_invalid;

  *convolution_op_out = convolution_op;
  return xnn_status_success;

error:
  xnn_delete_operator(convolution_op);
  return status;
}

static enum xnn_status setup_convolution2d_nhwc(
    xnn_operator_t convolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    void* output,
    uint32_t log2_input_element_size,
    uint32_t log2_filter_element_size,
    uint32_t bias_element_size,
    uint32_t log2_output_element_size,
    const void* params,
    size_t num_threads)
{
  convolution_op->state = xnn_run_state_invalid;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to setup Convolution operator: XNNPACK is not initialized");
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error(
      "failed to setup Convolution operator with %zux%zu input: input dimensions must be non-zero",
      input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  if (batch_size == 0) {
    convolution_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  convolution_op->batch_size = batch_size;
  convolution_op->input_height = input_height;
  convolution_op->input_width = input_width;
  convolution_op->input = input;

  if (convolution_op->flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) {
    convolution_op->output_height = compute_output_dimension_with_tf_same_padding(
      input_height, convolution_op->stride_height);
    convolution_op->output_width = compute_output_dimension_with_tf_same_padding(
      input_width, convolution_op->stride_width);

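    // TensorFlow SAME padding is resolved at setup time: split the total padding implied by the output size
    // evenly, with any odd remainder going to the bottom/right edge.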
    const uint32_t effective_kernel_height = (convolution_op->kernel_height - 1) * convolution_op->dilation_height + 1;
    const uint32_t effective_kernel_width = (convolution_op->kernel_width - 1) * convolution_op->dilation_width + 1;
    const size_t total_padding_height =
      (convolution_op->output_height - 1) * convolution_op->stride_height + effective_kernel_height - input_height;
    const size_t total_padding_width =
      (convolution_op->output_width - 1) * convolution_op->stride_width + effective_kernel_width - input_width;
    convolution_op->padding_top = total_padding_height / 2;
    convolution_op->padding_left = total_padding_width / 2;
    convolution_op->padding_bottom = total_padding_height - convolution_op->padding_top;
    convolution_op->padding_right = total_padding_width - convolution_op->padding_left;
  } else {
    convolution_op->output_height = compute_output_dimension(
      convolution_op->padding_top + input_height + convolution_op->padding_bottom,
      convolution_op->kernel_height,
      convolution_op->dilation_height,
      convolution_op->stride_height);
    convolution_op->output_width = compute_output_dimension(
      convolution_op->padding_left + input_width + convolution_op->padding_right,
      convolution_op->kernel_width,
      convolution_op->dilation_width,
      convolution_op->stride_width);
  }
  convolution_op->output = output;

  switch (convolution_op->ukernel.type) {
    case xnn_ukernel_type_gemm:
    {
      // Convolution maps directly to GEMM and doesn't use indirection buffer.

      const size_t output_height = convolution_op->output_height;
      const size_t output_width = convolution_op->output_width;
      const size_t output_size = output_height * output_width;
      const size_t batch_output_size = batch_size * output_size;

      const size_t groups = convolution_op->groups;
      const size_t group_input_channels = convolution_op->group_input_channels;
      const size_t w_stride = (round_up_po2(group_input_channels, convolution_op->ukernel.gemm.kr) << log2_filter_element_size) + bias_element_size;
      const size_t group_output_channels = convolution_op->group_output_channels;

      uint32_t mr = convolution_op->ukernel.gemm.mr;
      const uint32_t nr = convolution_op->ukernel.gemm.nr;
      struct xnn_hmp_gemm_ukernel gemm_ukernel = convolution_op->ukernel.gemm.general_case;
      if (batch_output_size == 1 && convolution_op->ukernel.gemm.mr1_case.function[XNN_UARCH_DEFAULT] != NULL) {
        mr = 1;
        gemm_ukernel = convolution_op->ukernel.gemm.mr1_case;
      }

      convolution_op->context.gemm = (struct gemm_context) {
        .k_scaled = group_input_channels << log2_input_element_size,
        .a = input,
        .a_stride = convolution_op->input_pixel_stride << log2_input_element_size,
        .packed_w = convolution_op->packed_weights,
        .w_stride = w_stride,
        .wg_stride = w_stride * round_up(group_output_channels, nr),
        .c = output,
        .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
        .cn_stride = nr << log2_output_element_size,
        .cg_stride = group_output_channels << log2_output_element_size,
        .log2_csize = log2_output_element_size,
        .ukernel = gemm_ukernel,
      };
      memcpy(&convolution_op->context.gemm.params, params, sizeof(convolution_op->context.gemm.params));

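      // Shrink the N tile so that, together with the M tiles, each thread gets roughly target_tiles_per_thread
      // tiles of work; the tile stays a multiple of nr so it lines up with the micro-kernel.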
      size_t nc = group_output_channels;
      if (num_threads > 1) {
        const size_t num_other_tiles = groups * divide_round_up(batch_output_size, mr);
        const size_t target_tiles_per_thread = 5;
        const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
        if (max_nc < nc) {
          nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
        }
      }
      if (groups == 1) {
        #if XNN_MAX_UARCH_TYPES > 1
          if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) {
            convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d_with_uarch;
            convolution_op->compute.task_2d_tile_2d_with_id = (pthreadpool_task_2d_tile_2d_with_id_t) xnn_compute_hmp_gemm;
          } else {
            convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
            convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm;
          }
        #else
          convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
          convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm;
        #endif
        convolution_op->compute.range[0] = batch_output_size;
        convolution_op->compute.range[1] = group_output_channels;
        convolution_op->compute.tile[0] = mr;
        convolution_op->compute.tile[1] = nc;
      } else {
        #if XNN_MAX_UARCH_TYPES > 1
          if (xnn_is_hmp_gemm_ukernel(gemm_ukernel)) {
            convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d_with_uarch;
            convolution_op->compute.task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_hmp_grouped_gemm;
          } else {
            convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
            convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_gemm;
          }
        #else
          convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
          convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_grouped_gemm;
        #endif
        convolution_op->compute.range[0] = groups;
        convolution_op->compute.range[1] = batch_output_size;
        convolution_op->compute.range[2] = group_output_channels;
        convolution_op->compute.tile[0] = mr;
        convolution_op->compute.tile[1] = nc;
      }
      convolution_op->state = xnn_run_state_ready;

      return xnn_status_success;
    }
    case xnn_ukernel_type_igemm:
    {
      const size_t groups = convolution_op->groups;
      const size_t kernel_height = convolution_op->kernel_height;
      const size_t kernel_width = convolution_op->kernel_width;
      const size_t kernel_size = kernel_height * kernel_width;
      const size_t output_height = convolution_op->output_height;
      const size_t output_width = convolution_op->output_width;
      const size_t output_size = output_height * output_width;

      uint32_t mr = convolution_op->ukernel.igemm.mr;
      const uint32_t nr = convolution_op->ukernel.igemm.nr;
      struct xnn_hmp_igemm_ukernel igemm_ukernel = convolution_op->ukernel.igemm.general_case;
      if (output_size == 1 && convolution_op->ukernel.igemm.mr1_case.function[XNN_UARCH_DEFAULT] != NULL) {
        mr = 1;
        igemm_ukernel = convolution_op->ukernel.igemm.mr1_case;
      }

      const size_t tiled_output_size = round_up(output_size, mr);
      const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;

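      // The indirection buffer holds pointers to the input pixels each output pixel reads (an implicit im2col);
      // it only needs to be rebuilt when the input spatial dimensions change.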
      if (input_height != convolution_op->last_input_height ||
          input_width != convolution_op->last_input_width)
      {
        const void** indirection_buffer = (const void**) xnn_reallocate_memory((void*) convolution_op->indirection_buffer, indirection_buffer_size);
        if (indirection_buffer == NULL) {
          xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
          return xnn_status_out_of_memory;
        }
        convolution_op->indirection_buffer = indirection_buffer;
        convolution_op->last_input = input;
        convolution_op->last_input_height = input_height;
        convolution_op->last_input_width = input_width;

        xnn_indirection_init_conv2d(convolution_op, mr, log2_input_element_size);
      }

      const size_t group_input_channels = convolution_op->group_input_channels;
      const size_t w_stride = (round_up_po2(group_input_channels, convolution_op->ukernel.igemm.kr) * kernel_size << log2_filter_element_size) + bias_element_size;
      const size_t group_output_channels = convolution_op->group_output_channels;
      convolution_op->context.igemm = (struct igemm_context) {
        .ks = kernel_size,
        .ks_scaled = kernel_size * mr * sizeof(void*),
        .kc = group_input_channels << log2_input_element_size,
        .w_stride = w_stride,
        .indirect_a = convolution_op->indirection_buffer,
        .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) convolution_op->last_input),
        .zero = convolution_op->zero_buffer,
        .packed_w = convolution_op->packed_weights,
        .c = convolution_op->output,
        .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
        .cn_stride = nr << log2_output_element_size,
        .ga_stride = group_input_channels << log2_input_element_size,
        .gw_stride = w_stride * round_up(group_output_channels, nr),
        .gc_stride = group_output_channels << log2_output_element_size,
        .ba_stride = input_height * input_width * convolution_op->input_pixel_stride << log2_input_element_size,
        .bc_stride = output_size * convolution_op->output_pixel_stride << log2_output_element_size,
        .log2_csize = log2_output_element_size,
        .ukernel = igemm_ukernel,
      };
      memcpy(&convolution_op->context.igemm.params, params, sizeof(convolution_op->context.igemm.params));

      size_t nc = group_output_channels;
      if (num_threads > 1) {
        const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
        const size_t target_tiles_per_thread = 5;
        const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
        if (max_nc < nc) {
          nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
        }
      }
      if (groups == 1) {
        #if XNN_MAX_UARCH_TYPES > 1
          if (xnn_is_hmp_igemm_ukernel(igemm_ukernel)) {
            convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d_with_uarch;
            convolution_op->compute.task_3d_tile_2d_with_id = (pthreadpool_task_3d_tile_2d_with_id_t) xnn_compute_hmp_igemm;
          } else {
            convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
            convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_igemm;
          }
        #else
          convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
          convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_igemm;
        #endif
        convolution_op->compute.range[0] = batch_size;
        convolution_op->compute.range[1] = output_size;
        convolution_op->compute.range[2] = group_output_channels;
        convolution_op->compute.tile[0] = mr;
        convolution_op->compute.tile[1] = nc;
      } else {
        #if XNN_MAX_UARCH_TYPES > 1
          if (xnn_is_hmp_igemm_ukernel(igemm_ukernel)) {
            convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d_with_uarch;
            convolution_op->compute.task_4d_tile_2d_with_id = (pthreadpool_task_4d_tile_2d_with_id_t) xnn_compute_hmp_grouped_igemm;
          } else {
            convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
            convolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_igemm;
          }
        #else
          convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
          convolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_grouped_igemm;
        #endif
        convolution_op->compute.range[0] = batch_size;
        convolution_op->compute.range[1] = groups;
        convolution_op->compute.range[2] = output_size;
        convolution_op->compute.range[3] = group_output_channels;
        convolution_op->compute.tile[0] = mr;
        convolution_op->compute.tile[1] = nc;
      }
      convolution_op->state = xnn_run_state_ready;

      return xnn_status_success;
    }
    case xnn_ukernel_type_dwconv:
    {
      size_t valid_batch_size = 0;
      if (input == convolution_op->last_input &&
          input_height == convolution_op->last_input_height &&
          input_width == convolution_op->last_input_width)
      {
        valid_batch_size = convolution_op->valid_batch_size;
        if (batch_size <= valid_batch_size) {
          convolution_op->compute.range[0] = batch_size * convolution_op->output_height;
          convolution_op->context.dwconv.output = output;
          convolution_op->state = xnn_run_state_ready;
          return xnn_status_success;
        }
      }

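      // The depthwise indirection buffer stores, per output row, pointers to the input pixels read by each
      // kernel window; it is sized for the whole batch and rebuilt when the input pointer or shape changes.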
      const size_t kernel_height = convolution_op->kernel_height;
      const size_t kernel_width = convolution_op->kernel_width;
      const size_t kernel_size = kernel_height * kernel_width;
      const size_t output_height = convolution_op->output_height;
      const size_t output_width = convolution_op->output_width;
      const size_t step_width = convolution_op->dilation_width == 1 ? convolution_op->stride_width : kernel_width;
      const size_t step_height = kernel_size + (output_width - 1) * step_width * kernel_height;
      const size_t indirection_buffer_size = sizeof(void*) * batch_size * output_height * step_height;

      const void** indirection_buffer =
        (const void**) xnn_reallocate_memory((void*) convolution_op->indirection_buffer, indirection_buffer_size);
      if (indirection_buffer == NULL) {
        xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
        return xnn_status_out_of_memory;
      }
      convolution_op->indirection_buffer = indirection_buffer;

      xnn_indirection_init_dwconv2d(convolution_op, valid_batch_size, step_height, step_width, log2_input_element_size);

      const size_t groups = convolution_op->groups;
      convolution_op->context.dwconv = (struct dwconv_context) {
        .groups = groups,
        .indirection_buffer = convolution_op->indirection_buffer,
        .indirection_buffer_row_stride = step_height,
        .indirection_buffer_col_stride = kernel_height * step_width * sizeof(void*),
        .packed_weights = convolution_op->packed_weights,
        .output = convolution_op->output,
        .output_width = output_width,
        .output_row_stride = output_width * convolution_op->output_pixel_stride << log2_output_element_size,
        .output_col_increment = (convolution_op->output_pixel_stride - groups) << log2_output_element_size,
        .unipass_ukernel = convolution_op->ukernel.dwconv.unipass_function,
      };
      memcpy(&convolution_op->context.dwconv.params, params, sizeof(convolution_op->context.dwconv.params));

      convolution_op->compute.type = xnn_parallelization_type_1d;
      convolution_op->compute.task_1d = (pthreadpool_task_1d_t) xnn_compute_dwconv_unipass;
      convolution_op->compute.range[0] = batch_size * output_height;
      convolution_op->state = xnn_run_state_ready;

      convolution_op->last_input = input;
      convolution_op->last_input_height = input_height;
      convolution_op->last_input_width = input_width;
      convolution_op->valid_batch_size = max(valid_batch_size, batch_size);

      return xnn_status_success;
    }
    case xnn_ukernel_type_vmulcaddc:
    {
      const size_t batch_output_size = batch_size * convolution_op->output_height * convolution_op->output_width;

      convolution_op->context.vmulcaddc = (struct vmulcaddc_context) {
        .n = convolution_op->groups << log2_input_element_size,
        .x = input,
        .x_stride = convolution_op->input_pixel_stride << log2_input_element_size,
        .w = convolution_op->packed_weights,
        .y = output,
        .y_stride = convolution_op->output_pixel_stride << log2_output_element_size,
        .ukernel = convolution_op->ukernel.vmulcaddc.function,
      };
      memcpy(&convolution_op->context.vmulcaddc.params, params, sizeof(convolution_op->context.vmulcaddc.params));

      size_t mc = batch_output_size;
      if (num_threads > 1) {
        const size_t target_tiles_per_thread = 5;
        const size_t max_mc = divide_round_up(batch_output_size, num_threads * target_tiles_per_thread);
        if (max_mc < mc) {
          const uint32_t mr = convolution_op->ukernel.vmulcaddc.mr;
          mc = min(mc, divide_round_up(mc, max_mc * mr) * mr);
        }
      }
      convolution_op->compute.type = xnn_parallelization_type_1d_tile_1d;
      convolution_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_vmulcaddc;
      convolution_op->compute.range[0] = batch_output_size;
      convolution_op->compute.tile[0] = mc;
      convolution_op->state = xnn_run_state_ready;

      return xnn_status_success;
    }
    default:
      XNN_UNREACHABLE;
  }
}

enum xnn_status xnn_setup_convolution2d_nhwc_q8(
    xnn_operator_t convolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const uint8_t* input,
    uint8_t* output,
    pthreadpool_t threadpool)
{
  if (convolution_op->type != xnn_operator_type_convolution_nhwc_q8) {
    xnn_log_error("failed to setup Convolution (NHWC, Q8) operator: operator type mismatch");
    return xnn_status_invalid_parameter;
  }

  return setup_convolution2d_nhwc(
    convolution_op,
    batch_size, input_height, input_width,
    input, output,
    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
    &convolution_op->q8_gemm_params,
    pthreadpool_get_threads_count(threadpool));
}

enum xnn_status xnn_setup_convolution2d_nhwc_f32(
    xnn_operator_t convolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const float* input,
    float* output,
    pthreadpool_t threadpool)
{
  if (convolution_op->type != xnn_operator_type_convolution_nhwc_f32) {
    xnn_log_error("failed to setup Convolution (NHWC, F32) operator: operator type mismatch");
    return xnn_status_invalid_parameter;
  }

  return setup_convolution2d_nhwc(
    convolution_op,
    batch_size, input_height, input_width,
    input, output,
    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
    2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
    sizeof(float) /* sizeof(bias element) */,
    2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
    &convolution_op->f32_minmax_params,
    pthreadpool_get_threads_count(threadpool));
}