// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/operator.h>
#include <xnnpack/log.h>
#include <xnnpack/common.h>
#include <xnnpack/compute.h>
#include <xnnpack/math.h>
#include <xnnpack/pack.h>
#include <xnnpack/params.h>
#include <xnnpack/indirection.h>

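// Output size along one spatial dimension for an input that is already padded:
// the dilated kernel covers (kernel - 1) * dilation + 1 input elements, and
// doz() (difference-or-zero) keeps the subtraction from underflowing.
// Example: padded input 9, kernel 3, dilation 2, stride 2 -> effective kernel 5,
// output (9 - 5) / 2 + 1 = 3.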
static inline size_t compute_output_dimension(
    size_t padded_input_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t subsampling_dimension)
{
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  return doz(padded_input_dimension, effective_kernel_dimension) / subsampling_dimension + 1;
}

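// Linear scan over a table of depthwise micro-kernels; an entry matches when its
// mr (the number of kernel taps it processes in one pass) equals the kernel size
// of this convolution. Returns NULL when no entry matches.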
static const struct dwconv_parameters* find_dwigemm_ukernel(
    size_t kernel_size,
    const struct dwconv_parameters* ukernel,
    size_t num_ukernels)
{
  while (num_ukernels-- != 0) {
    if (ukernel->mr == kernel_size) {
      return ukernel;
    }
    ukernel++;
  }
  return NULL;
}

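// Creates a quantized (Q8) NHWC convolution operator: validates the parameters,
// picks a micro-kernel type (depthwise, GEMM for 1x1, or indirect GEMM), packs the
// weights and bias into the layout that micro-kernel expects, and stores all
// shape-independent state in the operator descriptor.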
enum xnn_status xnn_create_convolution2d_nhwc_q8(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t subsampling_height,
    uint32_t subsampling_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    uint8_t input_zero_point,
    float input_scale,
    uint8_t kernel_zero_point,
    float kernel_scale,
    const uint8_t* kernel,
    const int32_t* bias,
    uint8_t output_zero_point,
    float output_scale,
    uint8_t output_min,
    uint8_t output_max,
    uint32_t flags,
    xnn_operator_t* convolution_op_out)
{
  xnn_operator_t convolution_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to create Convolution operator: XNNPACK is not initialized");
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      kernel_width, kernel_height);
    goto error;
  }

  if (subsampling_width == 0 || subsampling_height == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " subsampling: "
      "subsampling dimensions must be non-zero",
      subsampling_width, subsampling_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " dilation: "
      "dilation dimensions must be non-zero",
      dilation_width, dilation_height);
    goto error;
  }

  if (groups == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 " groups: number of groups must be non-zero", groups);
    goto error;
  }

  if (group_input_channels == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %zu input channels per group: "
      "number of channels must be non-zero",
      group_input_channels);
    goto error;
  }

  if (group_output_channels == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %zu output channels per group: "
      "number of channels must be non-zero",
      group_output_channels);
    goto error;
  }

  const size_t input_channels = groups * group_input_channels;
  if (input_pixel_stride < input_channels) {
    xnn_log_error(
      "failed to create Convolution operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
      input_pixel_stride, groups, group_input_channels);
    goto error;
  }

  const size_t output_channels = groups * group_output_channels;
  if (output_pixel_stride < output_channels) {
    xnn_log_error(
      "failed to create Convolution operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
      output_pixel_stride, groups, group_output_channels);
    goto error;
  }

  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    xnn_log_error(
      "failed to create Convolution operator with %.7g input scale: scale must be finite, normalized, and positive",
      input_scale);
    goto error;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    xnn_log_error(
      "failed to create Convolution operator with %.7g kernel scale: scale must be finite, normalized, and positive",
      kernel_scale);
    goto error;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    xnn_log_error(
      "failed to create Convolution operator with %.7g output scale: scale must be finite, normalized, and positive",
      output_scale);
    goto error;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create Convolution operator with [%" PRIu8 ", %" PRIu8 "] output range: "
      "range min must be below range max",
      output_min, output_max);
    goto error;
  }

  if ((flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) != 0 && group_input_channels != 1) {
    xnn_log_error(
      "failed to create Depthwise Convolution operator with %zu input channels per group: "
      "Depthwise Convolution must have exactly 1 input channel per group",
      group_input_channels);
    goto error;
  }

  status = xnn_status_unsupported_parameter;

  const uint32_t effective_kernel_height = (kernel_height - 1) * dilation_height + 1;
  const uint32_t effective_kernel_width = (kernel_width - 1) * dilation_width + 1;

  if (input_padding_top >= effective_kernel_height) {
    xnn_log_info(
      "inefficiency in Convolution operator with %" PRIu32 "x%" PRIu32 " effective kernel and %" PRIu32 "+%" PRIu32 " height padding: "
      "input top padding is greater or equal to effective kernel height",
      effective_kernel_width, effective_kernel_height, input_padding_top, input_padding_bottom);
  }

  if (input_padding_bottom >= effective_kernel_height) {
    xnn_log_info(
      "inefficiency in Convolution operator with %" PRIu32 "x%" PRIu32 " effective kernel and %" PRIu32 "+%" PRIu32 " height padding: "
      "input bottom padding is greater or equal to effective kernel height",
      effective_kernel_width, effective_kernel_height, input_padding_top, input_padding_bottom);
  }

  if (input_padding_right >= effective_kernel_width) {
    xnn_log_info(
      "inefficiency in Convolution operator with %" PRIu32 "x%" PRIu32 " effective kernel and %" PRIu32 "+%" PRIu32 " width padding: "
      "input right padding is greater or equal to effective kernel width",
      effective_kernel_width, effective_kernel_height, input_padding_left, input_padding_right);
  }

  if (input_padding_left >= effective_kernel_width) {
    xnn_log_info(
      "inefficiency in Convolution operator with %" PRIu32 "x%" PRIu32 " effective kernel and %" PRIu32 "+%" PRIu32 " width padding: "
      "input left padding is greater or equal to effective kernel width",
      effective_kernel_width, effective_kernel_height, input_padding_left, input_padding_right);
  }

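  // The fixed-point requantization used by the Q8 micro-kernels represents
  // input_scale * kernel_scale / output_scale as a multiplier below 1.0, so
  // larger combined scales cannot be expressed and are rejected here.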
  const float convolution_scale = input_scale * kernel_scale / output_scale;
  if (convolution_scale >= 1.0f) {
    xnn_log_error(
      "failed to create Convolution operator with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "convolution scale %.7g is greater or equal to 1.0",
      input_scale, kernel_scale, output_scale, convolution_scale);
    goto error;
  }

  status = xnn_status_out_of_memory;

  convolution_op = xnn_allocate_zero_memory(sizeof(struct xnn_operator));
  if (convolution_op == NULL) {
    xnn_log_error("failed to allocate %zu bytes for Convolution operator descriptor", sizeof(struct xnn_operator));
    goto error;
  }

  const size_t kernel_size = kernel_height * kernel_width;

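  // Select the micro-kernel type: a depthwise micro-kernel when every group has a
  // single input and output channel and a matching kernel-size ukernel exists, a
  // plain GEMM when the convolution is 1x1 with unit stride and no padding (it is
  // then a matrix multiplication over pixels), and an indirect GEMM otherwise.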
  enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_none;
  const struct dwconv_parameters* dwconv_parameters = NULL;
  const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
  if (group_input_channels == 1 && group_output_channels == 1 && groups > 1 &&
      (dwconv_parameters = find_dwigemm_ukernel(kernel_size, xnn_params.q8.dwconv, XNN_MAX_Q8_DWCONV_UKERNELS)) != NULL)
  {
    ukernel_type = xnn_ukernel_type_dwconv;
  } else if (kernel_size == 1 && subsampling_height == 1 && subsampling_width == 1 && !any_padding) {
    ukernel_type = xnn_ukernel_type_gemm;
  } else {
    ukernel_type = xnn_ukernel_type_igemm;
  }

  size_t zero_size = 0;
  switch (ukernel_type) {
    case xnn_ukernel_type_dwconv:
    {
      assert(dwconv_parameters != NULL);
      assert(dwconv_parameters->mr == kernel_size);

      const uint32_t c_stride = round_up_po2(groups, dwconv_parameters->cr);
      const size_t packed_weights_size = (sizeof(uint8_t) * kernel_size + sizeof(int32_t)) * c_stride;
      convolution_op->packed_weights = xnn_allocate_memory(packed_weights_size);
      if (convolution_op->packed_weights == NULL) {
        xnn_log_error("failed to allocate %zu bytes for packed weights", packed_weights_size);
        goto error;
      }

      if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
        xnn_pack_q8_dwconv_hwg_w(
          kernel_height, kernel_width,
          groups, dwconv_parameters->cr,
          input_zero_point, kernel_zero_point,
          kernel, bias, convolution_op->packed_weights);
      } else {
        xnn_pack_q8_dwconv_ghw_w(
          kernel_height, kernel_width,
          groups, dwconv_parameters->cr,
          input_zero_point, kernel_zero_point,
          kernel, bias, convolution_op->packed_weights);
      }

      convolution_op->ukernel.dwconv = (struct xnn_ukernel_dwconv) {
        .unipass_function = dwconv_parameters->up,
        .mr = dwconv_parameters->mr,
        .qr = dwconv_parameters->qr,
      };

      zero_size = sizeof(uint8_t) * c_stride + XNN_EXTRA_BYTES;
      break;
    }
    case xnn_ukernel_type_gemm:
    case xnn_ukernel_type_igemm:
    {
      const uint32_t nr = xnn_params.q8.gemm.nr;
      const uint32_t kr = UINT32_C(1) << xnn_params.q8.gemm.log2_kr;
      const uint32_t n_stride = round_up(group_output_channels, nr);
      const uint32_t k_stride = round_up_po2(group_input_channels, kr);

      const size_t packed_group_weights_size =
        (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t)) * n_stride;
      convolution_op->packed_weights = xnn_allocate_memory(packed_group_weights_size * groups);
      if (convolution_op->packed_weights == NULL) {
        xnn_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
        goto error;
      }
      memset(convolution_op->packed_weights, kernel_zero_point, packed_group_weights_size * groups);

      switch (ukernel_type) {
        case xnn_ukernel_type_gemm:
          xnn_pack_q8_gemm_goi_w(
            groups, group_output_channels, group_input_channels,
            nr, kr,
            input_zero_point, kernel_zero_point,
            kernel, bias, convolution_op->packed_weights);
          convolution_op->ukernel.gemm = (struct xnn_ukernel_gemm) {
            .mr = xnn_params.q8.gemm.mr,
            .nr = nr,
            .kr = kr,
            .default_function = xnn_params.q8.gemm.gemm,
          };
          break;
        case xnn_ukernel_type_igemm:
          if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
            xnn_pack_q8_conv_kgo_w(
              groups, group_output_channels, kernel_size,
              nr, kr,
              input_zero_point, kernel_zero_point,
              kernel, bias, convolution_op->packed_weights);
          } else {
            xnn_pack_q8_conv_goki_w(
              groups, group_output_channels, kernel_size, group_input_channels,
              nr, kr,
              input_zero_point, kernel_zero_point,
              kernel, bias, convolution_op->packed_weights);
          }
          convolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
            .mr = xnn_params.q8.gemm.mr,
            .nr = nr,
            .kr = kr,
            .default_function = xnn_params.q8.gemm.igemm,
          };
          break;
        default:
          XNN_UNREACHABLE;
      }

      zero_size = sizeof(uint8_t) * k_stride + XNN_EXTRA_BYTES;
      break;
    }
    default:
      XNN_UNREACHABLE;
  }

  if (any_padding) {
    void* zero_buffer = xnn_allocate_memory(zero_size);
    if (zero_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for zero padding", zero_size);
      goto error;
    }
    memset(zero_buffer, input_zero_point, zero_size);
    convolution_op->zero_buffer = zero_buffer;
  }

  convolution_op->padding_top = input_padding_top;
  convolution_op->padding_right = input_padding_right;
  convolution_op->padding_bottom = input_padding_bottom;
  convolution_op->padding_left = input_padding_left;

  convolution_op->kernel_height = kernel_height;
  convolution_op->kernel_width = kernel_width;
  convolution_op->stride_height = subsampling_height;
  convolution_op->stride_width = subsampling_width;
  convolution_op->dilation_height = dilation_height;
  convolution_op->dilation_width = dilation_width;
  convolution_op->groups = groups;
  convolution_op->group_input_channels = group_input_channels;
  convolution_op->group_output_channels = group_output_channels;
  convolution_op->input_pixel_stride = input_pixel_stride;
  convolution_op->output_pixel_stride = output_pixel_stride;

  convolution_op->kernel_zero_point = kernel_zero_point;

  convolution_op->q8_gemm_params =
    xnn_compute_q8_gemm_params(
      input_zero_point, kernel_zero_point,
      convolution_scale, output_zero_point, output_min, output_max);

  convolution_op->type = xnn_operator_type_convolution_q8;
  convolution_op->ukernel.type = ukernel_type;

  convolution_op->state = xnn_run_state_invalid;

  *convolution_op_out = convolution_op;
  return xnn_status_success;

error:
  xnn_delete_operator(convolution_op);
  return status;
}

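// A minimal usage sketch (illustrative, not part of this file): a caller creates the
// operator once, re-binds shapes and tensors with the setup call, and then runs it,
// e.g. for a 3x3, stride-1 convolution with 32 input and 64 output channels:
//
//   xnn_operator_t op = NULL;
//   xnn_create_convolution2d_nhwc_f32(
//     1, 1, 1, 1 /* padding */, 3, 3 /* kernel */, 1, 1 /* subsampling */, 1, 1 /* dilation */,
//     1 /* groups */, 32, 64 /* channels per group */, 32, 64 /* pixel strides */,
//     kernel, bias, -INFINITY, INFINITY /* output range */, 0 /* flags */, &op);
//   xnn_setup_convolution2d_nhwc_f32(op, batch_size, input_height, input_width, input, output, threadpool);
//   xnn_run_operator(op, threadpool);
//   xnn_delete_operator(op);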
enum xnn_status xnn_create_convolution2d_nhwc_f32(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t subsampling_height,
    uint32_t subsampling_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    size_t input_pixel_stride,
    size_t output_pixel_stride,
    const float* kernel,
    const float* bias,
    float output_min,
    float output_max,
    uint32_t flags,
    xnn_operator_t* convolution_op_out)
{
  xnn_operator_t convolution_op = NULL;
  enum xnn_status status = xnn_status_uninitialized;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to create Convolution operator: XNNPACK is not initialized");
    goto error;
  }

  status = xnn_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      kernel_width, kernel_height);
    goto error;
  }

  if (subsampling_width == 0 || subsampling_height == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " subsampling: "
      "subsampling dimensions must be non-zero",
      subsampling_width, subsampling_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 "x%" PRIu32 " dilation: "
      "dilation dimensions must be non-zero",
      dilation_width, dilation_height);
    goto error;
  }

  if (groups == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %" PRIu32 " groups: number of groups must be non-zero", groups);
    goto error;
  }

  if (group_input_channels == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %zu input channels per group: "
      "number of channels must be non-zero",
      group_input_channels);
    goto error;
  }

  if (group_output_channels == 0) {
    xnn_log_error(
      "failed to create Convolution operator with %zu output channels per group: "
      "number of channels must be non-zero",
      group_output_channels);
    goto error;
  }

  const size_t input_channels = groups * group_input_channels;
  if (input_pixel_stride < input_channels) {
    xnn_log_error(
      "failed to create Convolution operator with input pixel stride of %zu: "
      "stride must be at least as large as the number of input channels (%" PRIu32 "x%zu)",
      input_pixel_stride, groups, group_input_channels);
    goto error;
  }

  const size_t output_channels = groups * group_output_channels;
  if (output_pixel_stride < output_channels) {
    xnn_log_error(
      "failed to create Convolution operator with output pixel stride of %zu: "
      "stride must be at least as large as the number of output channels (%" PRIu32 "x%zu)",
      output_pixel_stride, groups, group_output_channels);
    goto error;
  }

  if (isnan(output_min)) {
    xnn_log_error(
      "failed to create Convolution operator with NaN output lower bound: lower bound must be non-NaN");
    goto error;
  }

  if (isnan(output_max)) {
    xnn_log_error(
      "failed to create Convolution operator with NaN output upper bound: upper bound must be non-NaN");
    goto error;
  }

  if (output_min >= output_max) {
    xnn_log_error(
      "failed to create Convolution operator with [%.7g, %.7g] output range: "
      "lower bound must be below upper bound",
      output_min, output_max);
    goto error;
  }

  if ((flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) != 0 && group_input_channels != 1) {
    xnn_log_error(
      "failed to create Depthwise Convolution operator with %zu input channels per group: "
      "Depthwise Convolution must have exactly 1 input channel per group",
      group_input_channels);
    goto error;
  }

  const uint32_t effective_kernel_height = (kernel_height - 1) * dilation_height + 1;
  const uint32_t effective_kernel_width = (kernel_width - 1) * dilation_width + 1;

  if (input_padding_top >= effective_kernel_height) {
    xnn_log_info(
      "inefficiency in Convolution operator with %" PRIu32 "x%" PRIu32 " effective kernel and %" PRIu32 "+%" PRIu32 " height padding: "
      "input top padding is greater or equal to effective kernel height",
      effective_kernel_width, effective_kernel_height, input_padding_top, input_padding_bottom);
  }

  if (input_padding_bottom >= effective_kernel_height) {
    xnn_log_info(
      "inefficiency in Convolution operator with %" PRIu32 "x%" PRIu32 " effective kernel and %" PRIu32 "+%" PRIu32 " height padding: "
      "input bottom padding is greater or equal to effective kernel height",
      effective_kernel_width, effective_kernel_height, input_padding_top, input_padding_bottom);
  }

  if (input_padding_right >= effective_kernel_width) {
    xnn_log_info(
      "inefficiency in Convolution operator with %" PRIu32 "x%" PRIu32 " effective kernel and %" PRIu32 "+%" PRIu32 " width padding: "
      "input right padding is greater or equal to effective kernel width",
      effective_kernel_width, effective_kernel_height, input_padding_left, input_padding_right);
  }

  if (input_padding_left >= effective_kernel_width) {
    xnn_log_info(
      "inefficiency in Convolution operator with %" PRIu32 "x%" PRIu32 " effective kernel and %" PRIu32 "+%" PRIu32 " width padding: "
      "input left padding is greater or equal to effective kernel width",
      effective_kernel_width, effective_kernel_height, input_padding_left, input_padding_right);
  }

  status = xnn_status_out_of_memory;

  convolution_op = xnn_allocate_zero_memory(sizeof(struct xnn_operator));
  if (convolution_op == NULL) {
    xnn_log_error("failed to allocate %zu bytes for Convolution operator descriptor", sizeof(struct xnn_operator));
    goto error;
  }

  const size_t kernel_size = kernel_height * kernel_width;

  enum xnn_ukernel_type ukernel_type = xnn_ukernel_type_none;
  const struct dwconv_parameters* dwconv_parameters = NULL;
  const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
  const bool unit_subsampling = (subsampling_width | subsampling_height) == 1;
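  // Micro-kernel selection mirrors the Q8 path, with one extra case: a 1x1
  // convolution with one input and one output channel per group, unit stride, and
  // no padding is just a per-channel multiply-add, handled by the vmulcaddc kernel.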
  if (group_input_channels == 1 && group_output_channels == 1 && kernel_size == 1 && unit_subsampling && !any_padding) {
    ukernel_type = xnn_ukernel_type_vmulcaddc;
  } else if (group_input_channels == 1 && group_output_channels == 1 && (dwconv_parameters =
               find_dwigemm_ukernel(kernel_size, xnn_params.f32.dwconv, XNN_MAX_F32_DWCONV_UKERNELS)) != NULL)
  {
    ukernel_type = xnn_ukernel_type_dwconv;
  } else if (kernel_size == 1 && unit_subsampling && !any_padding) {
    ukernel_type = xnn_ukernel_type_gemm;
  } else {
    ukernel_type = xnn_ukernel_type_igemm;
  }

  size_t zero_size = 0;
  switch (ukernel_type) {
    case xnn_ukernel_type_vmulcaddc:
    {
      const uint32_t c_stride = round_up_po2(groups, xnn_params.f32.vmulcaddc.cr);
      const size_t packed_weights_size = 2 * sizeof(float) * c_stride;
      convolution_op->packed_weights = xnn_allocate_memory(packed_weights_size);
      if (convolution_op->packed_weights == NULL) {
        xnn_log_error("failed to allocate %zu bytes for packed weights", packed_weights_size);
        goto error;
      }

      xnn_pack_f32_vmulcaddc_w(
        groups, xnn_params.f32.vmulcaddc.cr,
        kernel, bias, convolution_op->packed_weights);

      convolution_op->ukernel.vmulcaddc = (struct xnn_ukernel_vmulcaddc) {
        .function = xnn_params.f32.vmulcaddc.ukernel,
        .mr = xnn_params.f32.vmulcaddc.mr,
      };
      break;
    }
    case xnn_ukernel_type_dwconv:
    {
      assert(dwconv_parameters != NULL);
      assert(dwconv_parameters->mr == kernel_size);

      const uint32_t c_stride = round_up_po2(groups, dwconv_parameters->cr);
      const size_t packed_weights_size = (kernel_size + 1) * sizeof(float) * c_stride;
      convolution_op->packed_weights = xnn_allocate_memory(packed_weights_size);
      if (convolution_op->packed_weights == NULL) {
        xnn_log_error("failed to allocate %zu bytes for packed weights", packed_weights_size);
        goto error;
      }

      if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
        xnn_pack_f32_dwconv_hwg_w(
          kernel_height, kernel_width,
          groups, dwconv_parameters->cr,
          kernel, bias, convolution_op->packed_weights);
      } else {
        xnn_pack_f32_dwconv_ghw_w(
          kernel_height, kernel_width,
          groups, dwconv_parameters->cr,
          kernel, bias, convolution_op->packed_weights);
      }

      convolution_op->ukernel.dwconv = (struct xnn_ukernel_dwconv) {
        .unipass_function = dwconv_parameters->up,
        .mr = dwconv_parameters->mr,
        .qr = dwconv_parameters->qr,
      };

      zero_size = sizeof(float) * c_stride;
      break;
    }
    case xnn_ukernel_type_gemm:
    case xnn_ukernel_type_igemm:
    {
      const uint32_t nr = xnn_params.f32.gemm.nr;
      const uint32_t kr = UINT32_C(1) << xnn_params.f32.gemm.log2_kr;
      const uint32_t sr = UINT32_C(1) << xnn_params.f32.gemm.log2_sr;
      const uint32_t n_stride = round_up(group_output_channels, nr);
      const uint32_t k_stride = round_up_po2(group_input_channels, kr);

      const size_t packed_group_weights_size = (kernel_size * k_stride + 1) * sizeof(float) * n_stride;
      convolution_op->packed_weights = xnn_allocate_memory(packed_group_weights_size * groups);
      if (convolution_op->packed_weights == NULL) {
        xnn_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
        goto error;
      }
      memset(convolution_op->packed_weights, 0, packed_group_weights_size * groups);

      switch (ukernel_type) {
        case xnn_ukernel_type_gemm:
          xnn_pack_f32_gemm_goi_w(
            groups, group_output_channels, group_input_channels,
            nr, kr, sr,
            kernel, bias, convolution_op->packed_weights);
          convolution_op->ukernel.gemm = (struct xnn_ukernel_gemm) {
            .mr = xnn_params.f32.gemm.mr,
            .nr = nr,
            .kr = kr,
            .default_function = xnn_params.f32.gemm.gemm,
            .mr1_function = xnn_params.f32.gemm.gemm1,
          };
          break;
        case xnn_ukernel_type_igemm:
          if (flags & XNN_FLAG_DEPTHWISE_CONVOLUTION) {
            xnn_pack_f32_conv_kgo_w(
              groups, group_output_channels, kernel_size,
              nr, kr,
              kernel, bias, convolution_op->packed_weights);
          } else {
            xnn_pack_f32_conv_goki_w(
              groups, group_output_channels, kernel_size, group_input_channels,
              nr, kr, sr,
              kernel, bias, convolution_op->packed_weights);
          }
          convolution_op->ukernel.igemm = (struct xnn_ukernel_igemm) {
            .mr = xnn_params.f32.gemm.mr,
            .nr = nr,
            .kr = kr,
            .default_function = xnn_params.f32.gemm.igemm,
            .mr1_function = xnn_params.f32.gemm.igemm1,
          };
          break;
        default:
          XNN_UNREACHABLE;
      }

      zero_size = sizeof(float) * k_stride;
      break;
    }
    default:
      XNN_UNREACHABLE;
  }

  if (any_padding) {
    void* zero_buffer = xnn_allocate_zero_memory(zero_size);
    if (zero_buffer == NULL) {
      xnn_log_error("failed to allocate %zu bytes for zero padding", zero_size);
      goto error;
    }
    convolution_op->zero_buffer = zero_buffer;
  }

  convolution_op->padding_top = input_padding_top;
  convolution_op->padding_right = input_padding_right;
  convolution_op->padding_bottom = input_padding_bottom;
  convolution_op->padding_left = input_padding_left;

  convolution_op->kernel_height = kernel_height;
  convolution_op->kernel_width = kernel_width;
  convolution_op->stride_height = subsampling_height;
  convolution_op->stride_width = subsampling_width;
  convolution_op->dilation_height = dilation_height;
  convolution_op->dilation_width = dilation_width;
  convolution_op->groups = groups;
  convolution_op->group_input_channels = group_input_channels;
  convolution_op->group_output_channels = group_output_channels;
  convolution_op->input_pixel_stride = input_pixel_stride;
  convolution_op->output_pixel_stride = output_pixel_stride;

  convolution_op->f32_output_params = xnn_compute_f32_output_params(output_min, output_max);

  convolution_op->type = xnn_operator_type_convolution_f32;
  convolution_op->ukernel.type = ukernel_type;

  convolution_op->state = xnn_run_state_invalid;

  *convolution_op_out = convolution_op;
  return xnn_status_success;

error:
  xnn_delete_operator(convolution_op);
  return status;
}

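// Shape-dependent setup shared by the Q8 and F32 entry points: computes the output
// spatial dimensions, builds the compute context for the operator's micro-kernel
// type (including the indirection buffer for the indirect GEMM and depthwise paths),
// and picks a parallelization strategy for pthreadpool.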
static enum xnn_status setup_convolution2d_nhwc(
    xnn_operator_t convolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const void* input,
    void* output,
    uint32_t log2_input_element_size,
    uint32_t log2_filter_element_size,
    uint32_t bias_element_size,
    uint32_t log2_output_element_size,
    const void* params,
    size_t num_threads)
{
  convolution_op->state = xnn_run_state_invalid;

  if (!xnn_params.initialized) {
    xnn_log_error("failed to setup Convolution operator: XNNPACK is not initialized");
    return xnn_status_uninitialized;
  }

  if (input_width == 0 || input_height == 0) {
    xnn_log_error(
      "failed to setup Convolution operator with %zux%zu input: input dimensions must be non-zero",
      input_width, input_height);
    return xnn_status_invalid_parameter;
  }

  if (batch_size == 0) {
    convolution_op->state = xnn_run_state_skip;
    return xnn_status_success;
  }

  convolution_op->batch_size = batch_size;
  convolution_op->input_height = input_height;
  convolution_op->input_width = input_width;
  convolution_op->input = input;

  convolution_op->output_height = compute_output_dimension(
    convolution_op->padding_top + input_height + convolution_op->padding_bottom,
    convolution_op->kernel_height,
    convolution_op->dilation_height,
    convolution_op->stride_height);
  convolution_op->output_width = compute_output_dimension(
    convolution_op->padding_left + input_width + convolution_op->padding_right,
    convolution_op->kernel_width,
    convolution_op->dilation_width,
    convolution_op->stride_width);
  convolution_op->output = output;

  switch (convolution_op->ukernel.type) {
    case xnn_ukernel_type_gemm:
    {
      // Convolution maps directly to GEMM and doesn't use indirection buffer.

      const size_t output_height = convolution_op->output_height;
      const size_t output_width = convolution_op->output_width;
      const size_t output_size = output_height * output_width;
      const size_t batch_output_size = batch_size * output_size;

      const size_t groups = convolution_op->groups;
      const size_t group_input_channels = convolution_op->group_input_channels;
      const size_t w_stride = (round_up_po2(group_input_channels, convolution_op->ukernel.gemm.kr) << log2_filter_element_size) + bias_element_size;
      const size_t group_output_channels = convolution_op->group_output_channels;

      uint32_t mr = convolution_op->ukernel.gemm.mr;
      const uint32_t nr = convolution_op->ukernel.gemm.nr;
      xnn_gemm_ukernel_function gemm_ukernel = convolution_op->ukernel.gemm.default_function;
      if (batch_output_size == 1 && convolution_op->ukernel.gemm.mr1_function != NULL) {
        mr = 1;
        gemm_ukernel = convolution_op->ukernel.gemm.mr1_function;
      }

      convolution_op->context.gemm = (struct gemm_context) {
          .k_scaled = group_input_channels << log2_input_element_size,
          .a = input,
          .a_stride = convolution_op->input_pixel_stride << log2_input_element_size,
          .packed_w = convolution_op->packed_weights,
          .w_stride = w_stride,
          .wg_stride = w_stride * round_up(group_output_channels, nr),
          .c = output,
          .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
          .cn_stride = nr << log2_output_element_size,
          .cg_stride = group_output_channels << log2_output_element_size,
          .log2_csize = log2_output_element_size,
          .ukernel = gemm_ukernel,
      };
      memcpy(&convolution_op->context.gemm.params, params, sizeof(convolution_op->context.gemm.params));
      size_t nc = group_output_channels;
      if (num_threads > 1) {
        const size_t num_other_tiles = groups * divide_round_up(batch_output_size, mr);
        const size_t target_tiles_per_thread = 5;
        const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
        if (max_nc < nc) {
          nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
        }
      }
      if (groups == 1) {
        convolution_op->compute.type = xnn_parallelization_type_2d_tile_2d;
        convolution_op->compute.task_2d_tile_2d = (pthreadpool_task_2d_tile_2d_t) xnn_compute_gemm;
        convolution_op->compute.range[0] = batch_output_size;
        convolution_op->compute.range[1] = group_output_channels;
        convolution_op->compute.tile[0] = mr;
        convolution_op->compute.tile[1] = nc;
      } else {
        convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
        convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_ggemm;
        convolution_op->compute.range[0] = groups;
        convolution_op->compute.range[1] = batch_output_size;
        convolution_op->compute.range[2] = group_output_channels;
        convolution_op->compute.tile[0] = mr;
        convolution_op->compute.tile[1] = nc;
      }
      convolution_op->state = xnn_run_state_ready;

      return xnn_status_success;
    }
    case xnn_ukernel_type_igemm:
    {
      const size_t groups = convolution_op->groups;
      const size_t kernel_height = convolution_op->kernel_height;
      const size_t kernel_width = convolution_op->kernel_width;
      const size_t kernel_size = kernel_height * kernel_width;
      const size_t output_height = convolution_op->output_height;
      const size_t output_width = convolution_op->output_width;
      const size_t output_size = output_height * output_width;

      uint32_t mr = convolution_op->ukernel.igemm.mr;
      const uint32_t nr = convolution_op->ukernel.igemm.nr;
      xnn_igemm_ukernel_function igemm_ukernel = convolution_op->ukernel.igemm.default_function;
      if (output_size == 1 && convolution_op->ukernel.igemm.mr1_function != NULL) {
        mr = 1;
        igemm_ukernel = convolution_op->ukernel.igemm.mr1_function;
      }

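      // Indirect GEMM reads input pixels through a per-output-pixel pointer table
      // instead of materializing an im2col buffer. The table only depends on the
      // input spatial dimensions, so it is rebuilt only when those change.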
      const size_t tiled_output_size = round_up(output_size, mr);
      const size_t indirection_buffer_size = sizeof(void*) * kernel_size * tiled_output_size;

      if (input_height != convolution_op->last_input_height ||
          input_width != convolution_op->last_input_width)
      {
        const void** indirection_buffer = (const void**) realloc(convolution_op->indirection_buffer, indirection_buffer_size);
        if (indirection_buffer == NULL) {
          xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
          return xnn_status_out_of_memory;
        }
        convolution_op->indirection_buffer = indirection_buffer;
        convolution_op->last_input = input;
        convolution_op->last_input_height = input_height;
        convolution_op->last_input_width = input_width;

        xnn_indirection_init_conv2d(convolution_op, mr, log2_input_element_size);
      }

      const size_t group_input_channels = convolution_op->group_input_channels;
      const size_t w_stride = (round_up_po2(group_input_channels, convolution_op->ukernel.igemm.kr) * kernel_size << log2_filter_element_size) + bias_element_size;
      const size_t group_output_channels = convolution_op->group_output_channels;
      convolution_op->context.igemm = (struct igemm_context) {
          .ks = kernel_size,
          .ks_scaled = kernel_size * mr * sizeof(void*),
          .kc = group_input_channels << log2_input_element_size,
          .w_stride = w_stride,
          .indirect_a = convolution_op->indirection_buffer,
          .a_offset = (size_t) ((uintptr_t) input - (uintptr_t) convolution_op->last_input),
          .zero = convolution_op->zero_buffer,
          .packed_w = convolution_op->packed_weights,
          .c = convolution_op->output,
          .cm_stride = convolution_op->output_pixel_stride << log2_output_element_size,
          .cn_stride = nr << log2_output_element_size,
          .ga_stride = group_input_channels << log2_input_element_size,
          .gw_stride = w_stride * round_up(group_output_channels, nr),
          .gc_stride = group_output_channels << log2_output_element_size,
          .ba_stride = input_height * input_width * convolution_op->input_pixel_stride << log2_input_element_size,
          .bc_stride = output_size * convolution_op->output_pixel_stride << log2_output_element_size,
          .log2_csize = log2_output_element_size,
          .ukernel = igemm_ukernel,
      };
      memcpy(&convolution_op->context.igemm.params, params, sizeof(convolution_op->context.igemm.params));

      size_t nc = group_output_channels;
      if (num_threads > 1) {
        const size_t num_other_tiles = groups * batch_size * divide_round_up(output_size, mr);
        const size_t target_tiles_per_thread = 5;
        const size_t max_nc = divide_round_up(group_output_channels * num_other_tiles, num_threads * target_tiles_per_thread);
        if (max_nc < nc) {
          nc = min(nc, divide_round_up(nc, max_nc * nr) * nr);
        }
      }
      if (groups == 1) {
        convolution_op->compute.type = xnn_parallelization_type_3d_tile_2d;
        convolution_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_igemm;
        convolution_op->compute.range[0] = batch_size;
        convolution_op->compute.range[1] = output_size;
        convolution_op->compute.range[2] = group_output_channels;
        convolution_op->compute.tile[0] = mr;
        convolution_op->compute.tile[1] = nc;
      } else {
        convolution_op->compute.type = xnn_parallelization_type_4d_tile_2d;
        convolution_op->compute.task_4d_tile_2d = (pthreadpool_task_4d_tile_2d_t) xnn_compute_gigemm;
        convolution_op->compute.range[0] = batch_size;
        convolution_op->compute.range[1] = groups;
        convolution_op->compute.range[2] = output_size;
        convolution_op->compute.range[3] = group_output_channels;
        convolution_op->compute.tile[0] = mr;
        convolution_op->compute.tile[1] = nc;
      }
      convolution_op->state = xnn_run_state_ready;

      return xnn_status_success;
    }
    case xnn_ukernel_type_dwconv:
    {
      size_t valid_batch_size = 0;
      if (input == convolution_op->last_input &&
          input_height == convolution_op->last_input_height &&
          input_width == convolution_op->last_input_width)
      {
        valid_batch_size = convolution_op->valid_batch_size;
        if (batch_size <= valid_batch_size) {
          convolution_op->compute.range[0] = batch_size * convolution_op->output_height;
          convolution_op->state = xnn_run_state_ready;
          return xnn_status_success;
        }
      }

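      // The depthwise indirection buffer stores, for every output row, pointers to
      // the input pixels touched by each kernel tap; step_height is the number of
      // pointers per output row.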
      const size_t kernel_height = convolution_op->kernel_height;
      const size_t kernel_width = convolution_op->kernel_width;
      const size_t kernel_size = kernel_height * kernel_width;
      const size_t output_height = convolution_op->output_height;
      const size_t output_width = convolution_op->output_width;
      const size_t step_width = convolution_op->dilation_width == 1 ? convolution_op->stride_width : kernel_width;
      const size_t step_height = kernel_size + (output_width * step_width - 1) * kernel_height;
      const size_t indirection_buffer_size = sizeof(void*) * batch_size * output_height * step_height;

      const void** indirection_buffer =
        (const void**) realloc(convolution_op->indirection_buffer, indirection_buffer_size);
      if (indirection_buffer == NULL) {
        xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
        return xnn_status_out_of_memory;
      }
      convolution_op->indirection_buffer = indirection_buffer;

      xnn_indirection_init_dwconv2d(convolution_op, valid_batch_size, step_height, step_width, log2_input_element_size);

      const size_t groups = convolution_op->groups;
      convolution_op->context.dwconv = (struct dwconv_context) {
          .groups = groups,
          .indirection_buffer = convolution_op->indirection_buffer,
          .indirection_buffer_row_stride = step_height,
          .indirection_buffer_col_stride = kernel_height * step_width * sizeof(void*),
          .packed_weights = convolution_op->packed_weights,
          .output = convolution_op->output,
          .output_width = output_width,
          .output_row_stride = output_width * convolution_op->output_pixel_stride << log2_output_element_size,
          .output_col_increment = (convolution_op->output_pixel_stride - groups) << log2_output_element_size,
          .unipass_ukernel = convolution_op->ukernel.dwconv.unipass_function,
      };
      memcpy(&convolution_op->context.dwconv.params, params, sizeof(convolution_op->context.dwconv.params));

      convolution_op->compute.type = xnn_parallelization_type_1d;
      convolution_op->compute.task_1d = (pthreadpool_task_1d_t) xnn_compute_dwconv_unipass;
      convolution_op->compute.range[0] = batch_size * output_height;
      convolution_op->state = xnn_run_state_ready;

      convolution_op->last_input = input;
      convolution_op->last_input_height = input_height;
      convolution_op->last_input_width = input_width;
      convolution_op->valid_batch_size = max(valid_batch_size, batch_size);

      return xnn_status_success;
    }
    case xnn_ukernel_type_vmulcaddc:
    {
      const size_t batch_output_size = batch_size * convolution_op->output_height * convolution_op->output_width;

      convolution_op->context.vmulcaddc = (struct vmulcaddc_context) {
          .n = convolution_op->groups << log2_input_element_size,
          .x = input,
          .x_stride = convolution_op->input_pixel_stride << log2_input_element_size,
          .w = convolution_op->packed_weights,
          .y = output,
          .y_stride = convolution_op->output_pixel_stride << log2_output_element_size,
          .ukernel = convolution_op->ukernel.vmulcaddc.function,
      };
      memcpy(&convolution_op->context.vmulcaddc.params, params, sizeof(convolution_op->context.vmulcaddc.params));

      size_t mc = batch_output_size;
      if (num_threads > 1) {
        const size_t target_tiles_per_thread = 5;
        const size_t max_mc = divide_round_up(batch_output_size, num_threads * target_tiles_per_thread);
        if (max_mc < mc) {
          const uint32_t mr = convolution_op->ukernel.vmulcaddc.mr;
          mc = min(mc, divide_round_up(mc, max_mc * mr) * mr);
        }
      }
      convolution_op->compute.type = xnn_parallelization_type_1d_tile_1d;
      convolution_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_vmulcaddc;
      convolution_op->compute.range[0] = batch_output_size;
      convolution_op->compute.tile[0] = mc;
      convolution_op->state = xnn_run_state_ready;

      return xnn_status_success;
    }
    default:
      XNN_UNREACHABLE;
  }
}

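// The type-specific setup wrappers below only check the operator type and forward
// the element sizes as log2 values (0 for uint8_t, 2 for float), so the shared
// setup code can use shifts instead of multiplications.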
enum xnn_status xnn_setup_convolution2d_nhwc_q8(
    xnn_operator_t convolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const uint8_t* input,
    uint8_t* output,
    pthreadpool_t threadpool)
{
  if (convolution_op->type != xnn_operator_type_convolution_q8) {
    xnn_log_error("failed to setup Convolution (Q8) operator: operator type mismatch");
    return xnn_status_invalid_parameter;
  }

  return setup_convolution2d_nhwc(
    convolution_op,
    batch_size, input_height, input_width,
    input, output,
    0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
    0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
    sizeof(int32_t) /* sizeof(bias element) */,
    0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
    &convolution_op->q8_gemm_params,
    pthreadpool_get_threads_count(threadpool));
}

enum xnn_status xnn_setup_convolution2d_nhwc_f32(
    xnn_operator_t convolution_op,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const float* input,
    float* output,
    pthreadpool_t threadpool)
{
  if (convolution_op->type != xnn_operator_type_convolution_f32) {
    xnn_log_error("failed to setup Convolution (F32) operator: operator type mismatch");
    return xnn_status_invalid_parameter;
  }

  return setup_convolution2d_nhwc(
    convolution_op,
    batch_size, input_height, input_width,
    input, output,
    2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
    2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
    sizeof(float) /* sizeof(bias element) */,
    2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
    &convolution_op->f32_output_params,
    pthreadpool_get_threads_count(threadpool));
}