// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8
#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <xnnpack.h>
#include <xnnpack/allocator.h>
#include <xnnpack/operator.h>
#include <xnnpack/log.h>
#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/params.h>
#include <xnnpack/params-init.h>
#include <xnnpack/indirection.h>
26
27
// Computes the number of pooling window positions along one spatial dimension.
//
//   padded_input_dimension - input extent with the padding on both sides already added.
//   pooling_dimension      - extent of the pooling window along the same dimension.
//   stride_dimension       - step between consecutive window positions; used as a divisor, so it must be non-zero.
//
// The subtraction is performed on unsigned values: the caller must guarantee
// padded_input_dimension >= pooling_dimension, otherwise the difference wraps around.
static inline size_t compute_output_dimension(
    size_t padded_input_dimension,
    size_t pooling_dimension,
    size_t stride_dimension)
{
  return (padded_input_dimension - pooling_dimension) / stride_dimension + 1;
}
35
36enum xnn_status xnn_create_average_pooling2d_nhwc_q8(
37 uint32_t input_padding_top,
38 uint32_t input_padding_right,
39 uint32_t input_padding_bottom,
40 uint32_t input_padding_left,
41 uint32_t pooling_height,
42 uint32_t pooling_width,
43 uint32_t stride_height,
44 uint32_t stride_width,
45 size_t channels,
46 size_t input_pixel_stride,
47 size_t output_pixel_stride,
48 uint8_t input_zero_point,
49 float input_scale,
50 uint8_t output_zero_point,
51 float output_scale,
52 uint8_t output_min,
53 uint8_t output_max,
54 uint32_t flags,
55 xnn_operator_t* average_pooling_op_out)
56{
57 xnn_operator_t average_pooling_op = NULL;
58 enum xnn_status status = xnn_status_uninitialized;
59
60 if (!xnn_params.initialized) {
61 xnn_log_error("failed to create Average Pooling operator: XNNPACK is not initialized");
62 goto error;
63 }
64
65 status = xnn_status_invalid_parameter;
66
67 const uint32_t pooling_size = pooling_height * pooling_width;
68 if (pooling_size == 0) {
69 xnn_log_error(
70 "failed to create Average Pooling operator with %" PRIu32 "x%" PRIu32 " pooling size: "
71 "pooling size dimensions must be non-zero",
72 pooling_width, pooling_height);
73 goto error;
74 }
75
76 if (pooling_size == 1) {
77 xnn_log_error(
78 "failed to create Average Pooling operator with 1 pooling element: 1x1 pooling is meaningless");
79 goto error;
80 }
81
82 if (stride_height == 0 || stride_width == 0) {
83 xnn_log_error(
84 "failed to create Average Pooling operator with %" PRIu32 "x%" PRIu32 " stride: "
85 "stride dimensions must be non-zero",
86 stride_width, stride_height);
87 goto error;
88 }
89
90 if (channels == 0) {
91 xnn_log_error(
92 "failed to create Average Pooling operator with %zu channels: number of channels must be non-zero",
93 channels);
94 goto error;
95 }
96
97 if (input_pixel_stride < channels) {
98 xnn_log_error(
99 "failed to create Average Pooling operator with input pixel stride of %zu: "
100 "stride must be at least as large as the number of channels (%zu)",
101 input_pixel_stride, channels);
102 goto error;
103 }
104
105 if (output_pixel_stride < channels) {
106 xnn_log_error(
107 "failed to create Average Pooling operator with output pixel stride of %zu: "
108 "stride must be at least as large as the number of channels (%zu)",
109 output_pixel_stride, channels);
110 goto error;
111 }
112
113 if (input_scale <= 0.0f || !isnormal(input_scale)) {
114 xnn_log_error(
115 "failed to create Average Pooling operator with %.7g input scale: "
116 "scale must be finite, normalized, and positive",
117 input_scale);
118 goto error;
119 }
120
121 if (output_scale <= 0.0f || !isnormal(output_scale)) {
122 xnn_log_error(
123 "failed to create Average Pooling operator with %.7g output scale: "
124 "scale must be finite, normalized, and positive",
125 output_scale);
126 goto error;
127 }
128
129 if (output_min >= output_max) {
130 xnn_log_error(
131 "failed to create Average Pooling operator with [%" PRIu8 ", %" PRIu8 "] output range: "
132 "range min must be below range max",
133 output_min, output_max);
134 goto error;
135 }
136
137 status = xnn_status_unsupported_parameter;
138
139 const float input_output_scale = input_scale / output_scale;
140 if (input_output_scale < 0x1.0p-8f || input_output_scale >= 0x1.0p+8f) {
141 xnn_log_error(
142 "failed to create Average Pooling operator with %.7g input scale and %.7g output scale: "
143 "input-to-output scale ratio (%.7f) must be in [2**-8, 2**8) range",
144 input_scale, output_scale, input_output_scale);
145 goto error;
146 }
147
148 if (pooling_size >= 16777216) {
149 xnn_log_error(
150 "failed to create Average Pooling operator with %"PRIu32" (%" PRIu32 "x%" PRIu32 ") pooling elements: "
151 "the number of elements in the pooling area must be below 2**24",
152 pooling_size, pooling_width, pooling_height);
153 goto error;
154 }
155
156 status = xnn_status_out_of_memory;
157
158 average_pooling_op = xnn_allocate_zero_memory(sizeof(struct xnn_operator));
159 if (average_pooling_op == NULL) {
160 xnn_log_error("failed to allocate %zu bytes for Average Pooling operator descriptor", sizeof(struct xnn_operator));
161 goto error;
162 }
163
164 const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
165 const uint32_t mr = xnn_params.q8.avgpool.mr;
166 const uint32_t qr = xnn_params.q8.avgpool.qr;
167 if (any_padding || pooling_size < mr || (pooling_size - mr) % qr != 0) {
168 void* zero_buffer = xnn_allocate_memory(channels * sizeof(uint8_t) + XNN_EXTRA_BYTES);
169 if (zero_buffer == NULL) {
170 xnn_log_error("failed to allocate %zu bytes for Average Pooling zero padding",
171 channels * sizeof(uint8_t) + XNN_EXTRA_BYTES);
172 goto error;
173 }
174 memset(zero_buffer, input_zero_point, channels * sizeof(uint8_t));
175 average_pooling_op->zero_buffer = zero_buffer;
176 }
177
178 average_pooling_op->padding_top = input_padding_top;
179 average_pooling_op->padding_right = input_padding_right;
180 average_pooling_op->padding_bottom = input_padding_bottom;
181 average_pooling_op->padding_left = input_padding_left;
182
183 average_pooling_op->kernel_height = pooling_height;
184 average_pooling_op->kernel_width = pooling_width;
185 average_pooling_op->stride_height = stride_height;
186 average_pooling_op->stride_width = stride_width;
187 average_pooling_op->dilation_height = 1;
188 average_pooling_op->dilation_width = 1;
189 average_pooling_op->channels = channels;
190 average_pooling_op->input_pixel_stride = input_pixel_stride;
191 average_pooling_op->output_pixel_stride = output_pixel_stride;
192
193 // Number of rows read in the micro-kernel.
194 const size_t nrows = round_up(doz(pooling_size, mr), qr) + mr;
195 average_pooling_op->q8_avgpool_params =
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700196 xnn_init_q8_avgpool_params(
XNNPACK Teamb455b122019-09-27 18:10:33 -0700197 (int32_t) -((uint32_t) input_zero_point * (uint32_t) nrows),
198 input_scale / (output_scale * (float) pooling_size),
199 output_zero_point, output_min, output_max);
200
201 average_pooling_op->type = xnn_operator_type_average_pooling_q8;
202 average_pooling_op->ukernel.type = xnn_ukernel_type_average_pooling;
203
204 *average_pooling_op_out = average_pooling_op;
205 return xnn_status_success;
206
207error:
208 xnn_delete_operator(average_pooling_op);
209 return status;
210}
211
212enum xnn_status xnn_create_average_pooling2d_nhwc_f32(
213 uint32_t input_padding_top,
214 uint32_t input_padding_right,
215 uint32_t input_padding_bottom,
216 uint32_t input_padding_left,
217 uint32_t pooling_height,
218 uint32_t pooling_width,
219 uint32_t stride_height,
220 uint32_t stride_width,
221 size_t channels,
222 size_t input_pixel_stride,
223 size_t output_pixel_stride,
224 float output_min,
225 float output_max,
226 uint32_t flags,
227 xnn_operator_t* average_pooling_op_out)
228{
229 xnn_operator_t average_pooling_op = NULL;
230 enum xnn_status status = xnn_status_uninitialized;
231
232 if (!xnn_params.initialized) {
233 xnn_log_error("failed to create Average Pooling operator: XNNPACK is not initialized");
234 goto error;
235 }
236
237 status = xnn_status_invalid_parameter;
238
239 const uint32_t pooling_size = pooling_height * pooling_width;
240 if (pooling_size == 0) {
241 xnn_log_error(
242 "failed to create Average Pooling operator with %" PRIu32 "x%" PRIu32 " pooling size: "
243 "pooling size dimensions must be non-zero",
244 pooling_width, pooling_height);
245 goto error;
246 }
247
248 if (pooling_size == 1) {
249 xnn_log_error(
250 "failed to create Average Pooling operator with 1 pooling element: 1x1 pooling is meaningless");
251 goto error;
252 }
253
254 if (stride_height == 0 || stride_width == 0) {
255 xnn_log_error(
256 "failed to create Average Pooling operator with %" PRIu32 "x%" PRIu32 " stride: "
257 "stride dimensions must be non-zero",
258 stride_width, stride_height);
259 goto error;
260 }
261
262 if (channels == 0) {
263 xnn_log_error(
264 "failed to create Average Pooling operator with %zu channels: number of channels must be non-zero",
265 channels);
266 goto error;
267 }
268
269 if (input_pixel_stride < channels) {
270 xnn_log_error(
271 "failed to create Average Pooling operator with input pixel stride of %zu: "
272 "stride must be at least as large as the number of channels (%zu)",
273 input_pixel_stride, channels);
274 goto error;
275 }
276
277 if (output_pixel_stride < channels) {
278 xnn_log_error(
279 "failed to create Average Pooling operator with output pixel stride of %zu: "
280 "stride must be at least as large as the number of channels (%zu)",
281 output_pixel_stride, channels);
282 goto error;
283 }
284
285 if (isnan(output_min)) {
286 xnn_log_error(
287 "failed to create Average Pooling operator with NaN output lower bound: lower bound must be non-NaN");
288 goto error;
289 }
290
291 if (isnan(output_max)) {
292 xnn_log_error(
293 "failed to create Average Pooling operator with NaN output upper bound: upper bound must be non-NaN");
294 goto error;
295 }
296
297 if (output_min >= output_max) {
298 xnn_log_error(
299 "failed to create Average Pooling operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
300 output_min, output_max);
301 goto error;
302 }
303
304 status = xnn_status_out_of_memory;
305
306 average_pooling_op = xnn_allocate_zero_memory(sizeof(struct xnn_operator));
307 if (average_pooling_op == NULL) {
308 xnn_log_error("failed to allocate %zu bytes for Average Pooling operator descriptor", sizeof(struct xnn_operator));
309 goto error;
310 }
311
312 const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
313 const uint32_t mr = xnn_params.f32.avgpool.mr;
314 const uint32_t qr = xnn_params.f32.avgpool.qr;
315 if (any_padding || pooling_size < mr || (pooling_size - mr) % qr != 0) {
316 void* zero_buffer = xnn_allocate_memory(channels * sizeof(float) + XNN_EXTRA_BYTES);
317 if (zero_buffer == NULL) {
318 xnn_log_error("failed to allocate %zu bytes for Average Pooling zero padding",
319 channels * sizeof(float) + XNN_EXTRA_BYTES);
320 goto error;
321 }
322 memset(zero_buffer, 0, channels * sizeof(float));
323 average_pooling_op->zero_buffer = zero_buffer;
324 }
325
326 average_pooling_op->padding_top = input_padding_top;
327 average_pooling_op->padding_right = input_padding_right;
328 average_pooling_op->padding_bottom = input_padding_bottom;
329 average_pooling_op->padding_left = input_padding_left;
330
331 average_pooling_op->kernel_height = pooling_height;
332 average_pooling_op->kernel_width = pooling_width;
333 average_pooling_op->stride_height = stride_height;
334 average_pooling_op->stride_width = stride_width;
335 average_pooling_op->dilation_height = 1;
336 average_pooling_op->dilation_width = 1;
337 average_pooling_op->channels = channels;
338 average_pooling_op->input_pixel_stride = input_pixel_stride;
339 average_pooling_op->output_pixel_stride = output_pixel_stride;
340
341 average_pooling_op->type = xnn_operator_type_average_pooling_f32;
342 if (any_padding) {
343 average_pooling_op->f32_output_params =
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700344 xnn_init_f32_output_params(output_min, output_max);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700345
346 average_pooling_op->ukernel.type = xnn_ukernel_type_pixelwise_average_pooling;
347 } else {
348 average_pooling_op->f32_avgpool_params =
Marat Dukhaneeaa7bd2019-10-25 17:31:25 -0700349 xnn_init_f32_avgpool_params(1.0f / (float) pooling_size, output_min, output_max);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700350
351 average_pooling_op->ukernel.type = xnn_ukernel_type_average_pooling;
352 }
353
354 *average_pooling_op_out = average_pooling_op;
355 return xnn_status_success;
356
357error:
358 xnn_delete_operator(average_pooling_op);
359 return status;
360}
361
362enum xnn_status xnn_setup_average_pooling2d_nhwc_q8(
363 xnn_operator_t average_pooling_op,
364 size_t batch_size,
365 size_t input_height,
366 size_t input_width,
367 const uint8_t* input,
368 uint8_t* output,
369 pthreadpool_t threadpool)
370{
371 if (average_pooling_op->type != xnn_operator_type_average_pooling_q8) {
372 xnn_log_error("failed to setup Average Pooling (Q8) operator: operator type mismatch");
373 return xnn_status_invalid_parameter;
374 }
375 average_pooling_op->state = xnn_run_state_invalid;
376
377 if (!xnn_params.initialized) {
378 xnn_log_error("failed to setup Average Pooling operator: XNNPACK is not initialized");
379 return xnn_status_uninitialized;
380 }
381
382 if (input_width == 0 || input_height == 0) {
383 xnn_log_error(
384 "failed to setup Average Pooling operator with %zux%zu input: input dimensions must be non-zero",
385 input_width, input_height);
386 return xnn_status_invalid_parameter;
387 }
388
389 if (batch_size == 0) {
390 average_pooling_op->state = xnn_run_state_skip;
391 return xnn_status_success;
392 }
393
394 average_pooling_op->batch_size = batch_size;
395 average_pooling_op->input_height = input_height;
396 average_pooling_op->input_width = input_width;
397 average_pooling_op->input = input;
398
399 average_pooling_op->output_height = compute_output_dimension(
400 average_pooling_op->padding_top + input_height + average_pooling_op->padding_bottom,
401 average_pooling_op->kernel_height,
402 average_pooling_op->stride_height);
403 average_pooling_op->output_width = compute_output_dimension(
404 average_pooling_op->padding_left + input_width + average_pooling_op->padding_right,
405 average_pooling_op->kernel_width,
406 average_pooling_op->stride_width);
407 average_pooling_op->output = output;
408
409 size_t valid_batch_size = 0;
410 if (input == average_pooling_op->last_input &&
411 input_height == average_pooling_op->last_input_height &&
412 input_width == average_pooling_op->last_input_width)
413 {
414 valid_batch_size = average_pooling_op->valid_batch_size;
415 if (batch_size <= valid_batch_size) {
416 average_pooling_op->compute.range[0] = batch_size;
417 average_pooling_op->state = xnn_run_state_ready;
418 return xnn_status_success;
419 }
420 }
421
422 const size_t pooling_height = average_pooling_op->kernel_height;
423 const size_t pooling_width = average_pooling_op->kernel_width;
424 const size_t pooling_size = pooling_height * pooling_width;
425 const size_t output_height = average_pooling_op->output_height;
426 const size_t output_width = average_pooling_op->output_width;
427 // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
428 const uint32_t mr = xnn_params.q8.avgpool.mr;
429
430 const size_t step_width = min(average_pooling_op->stride_width, pooling_width);
431 const size_t step_height = pooling_size + (output_width * step_width - 1) * pooling_height;
432 const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + batch_size * output_height * step_height);
433
434 const void** indirection_buffer = (const void**) realloc(average_pooling_op->indirection_buffer, indirection_buffer_size);
435 if (indirection_buffer == NULL) {
436 xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
437 return xnn_status_out_of_memory;
438 }
439 average_pooling_op->indirection_buffer = indirection_buffer;
440
441 xnn_indirection_init_dwconv2d(
442 average_pooling_op, valid_batch_size, step_height, step_width, 0 /* log2(sizeof(uint8_t)) */);
443
444 const uint32_t qr = xnn_params.q8.avgpool.qr;
445 const size_t channels = average_pooling_op->channels;
446
447 const size_t indirect_input_height_stride = step_height * sizeof(void*);
448 const size_t output_width_stride = average_pooling_op->output_pixel_stride * sizeof(uint8_t);
449 const size_t output_height_stride = output_width * output_width_stride;
450
451 const size_t multipass_adjustment =
452 pooling_size > mr ? round_up(pooling_size - mr, qr) + mr - qr : 0;
453 average_pooling_op->context.average_pooling = (struct average_pooling_context) {
454 .indirect_input = indirection_buffer,
455 .indirect_input_batch_stride = output_height * indirect_input_height_stride,
456 .indirect_input_height_stride = indirect_input_height_stride,
457 .output = output,
458 .output_batch_stride = output_height * output_height_stride,
459 .output_height_stride = output_height_stride,
460 .output_width = output_width,
461 .pooling_size = pooling_size,
462 .channels = channels,
463 .zero = average_pooling_op->zero_buffer,
464 .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
465 .output_increment = output_width_stride - channels * sizeof(uint8_t),
466 .params.q8 = average_pooling_op->q8_avgpool_params,
467 };
468 average_pooling_op->compute.type = xnn_parallelization_type_2d;
469 average_pooling_op->compute.range[0] = batch_size;
470 average_pooling_op->compute.range[1] = output_height;
471
472 if (pooling_size <= mr) {
473 average_pooling_op->context.average_pooling.unipass_ukernel = xnn_params.q8.avgpool.up;
474 average_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_average_pooling_unipass;
475 } else {
476 average_pooling_op->context.average_pooling.multipass_ukernel = xnn_params.q8.avgpool.mp;
477 average_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_average_pooling_multipass;
478 }
479 average_pooling_op->state = xnn_run_state_ready;
480
481 average_pooling_op->last_input = input;
482 average_pooling_op->last_input_height = input_height;
483 average_pooling_op->last_input_width = input_width;
484 average_pooling_op->valid_batch_size = max(valid_batch_size, batch_size);
485
486 return xnn_status_success;
487}
488
489enum xnn_status xnn_setup_average_pooling2d_nhwc_f32(
490 xnn_operator_t average_pooling_op,
491 size_t batch_size,
492 size_t input_height,
493 size_t input_width,
494 const float* input,
495 float* output,
496 pthreadpool_t threadpool)
497{
498 if (average_pooling_op->type != xnn_operator_type_average_pooling_f32) {
499 xnn_log_error("failed to setup Average Pooling (F32) operator: operator type mismatch");
500 return xnn_status_invalid_parameter;
501 }
502 average_pooling_op->state = xnn_run_state_invalid;
503
504 if (!xnn_params.initialized) {
505 xnn_log_error("failed to setup Average Pooling operator: XNNPACK is not initialized");
506 return xnn_status_uninitialized;
507 }
508
509 if (input_width == 0 || input_height == 0) {
510 xnn_log_error(
511 "failed to setup Average Pooling operator with %zux%zu input: input dimensions must be non-zero",
512 input_width, input_height);
513 return xnn_status_invalid_parameter;
514 }
515
516 if (batch_size == 0) {
517 average_pooling_op->state = xnn_run_state_skip;
518 return xnn_status_success;
519 }
520
521 average_pooling_op->batch_size = batch_size;
522 average_pooling_op->input_height = input_height;
523 average_pooling_op->input_width = input_width;
524 average_pooling_op->input = input;
525
526 average_pooling_op->output_height = compute_output_dimension(
527 average_pooling_op->padding_top + input_height + average_pooling_op->padding_bottom,
528 average_pooling_op->kernel_height,
529 average_pooling_op->stride_height);
530 average_pooling_op->output_width = compute_output_dimension(
531 average_pooling_op->padding_left + input_width + average_pooling_op->padding_right,
532 average_pooling_op->kernel_width,
533 average_pooling_op->stride_width);
534 average_pooling_op->output = output;
535
536 size_t valid_batch_size = 0;
537 if (input == average_pooling_op->last_input &&
538 input_height == average_pooling_op->last_input_height &&
539 input_width == average_pooling_op->last_input_width)
540 {
541 valid_batch_size = average_pooling_op->valid_batch_size;
542 if (batch_size <= valid_batch_size) {
543 average_pooling_op->compute.range[0] = batch_size;
544 average_pooling_op->state = xnn_run_state_ready;
545 return xnn_status_success;
546 }
547 }
548
549 const size_t pooling_height = average_pooling_op->kernel_height;
550 const size_t pooling_width = average_pooling_op->kernel_width;
551 const size_t pooling_size = pooling_height * pooling_width;
552 const size_t output_height = average_pooling_op->output_height;
553 const size_t output_width = average_pooling_op->output_width;
554 // Micro-kernel may read up to (mr - 1) elements after the end of indirection buffer.
555 const uint32_t mr = xnn_params.f32.avgpool.mr;
556 assert(mr == xnn_params.f32.pavgpool.mr);
557
558 const size_t step_width = min(average_pooling_op->stride_width, pooling_width);
559 const size_t step_height = pooling_size + (output_width * step_width - 1) * pooling_height;
560 const size_t indirection_buffer_size = sizeof(void*) * ((mr - 1) + batch_size * output_height * step_height);
561
562 const void** indirection_buffer = (const void**) realloc(average_pooling_op->indirection_buffer, indirection_buffer_size);
563 if (indirection_buffer == NULL) {
564 xnn_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
565 return xnn_status_out_of_memory;
566 }
567 average_pooling_op->indirection_buffer = indirection_buffer;
568
569 xnn_indirection_init_dwconv2d(
570 average_pooling_op, valid_batch_size, step_height, step_width, 2 /* log2(sizeof(float)) */);
571
572 const size_t channels = average_pooling_op->channels;
573
574 const size_t indirect_input_height_stride = step_height * sizeof(void*);
575 const size_t output_width_stride = average_pooling_op->output_pixel_stride * sizeof(float);
576 const size_t output_height_stride = output_width * output_width_stride;
577
578 switch (average_pooling_op->ukernel.type) {
579 case xnn_ukernel_type_average_pooling:
580 {
581 const uint32_t qr = xnn_params.f32.avgpool.qr;
582 const size_t multipass_adjustment =
583 pooling_size > mr ? round_up(pooling_size - mr, qr) + mr - qr : 0;
584 average_pooling_op->context.average_pooling = (struct average_pooling_context) {
585 .indirect_input = indirection_buffer,
586 .indirect_input_batch_stride = output_height * indirect_input_height_stride,
587 .indirect_input_height_stride = indirect_input_height_stride,
588 .output = output,
589 .output_batch_stride = output_height * output_height_stride,
590 .output_height_stride = output_height_stride,
591 .output_width = output_width,
592 .pooling_size = pooling_size,
593 .channels = channels,
594 .zero = average_pooling_op->zero_buffer,
595 .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
596 .output_increment = output_width_stride - channels * sizeof(float),
597 .params.f32 = average_pooling_op->f32_avgpool_params,
598 };
599 if (pooling_size <= mr) {
600 average_pooling_op->context.average_pooling.unipass_ukernel = xnn_params.f32.avgpool.up;
601 average_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_average_pooling_unipass;
602 } else {
603 average_pooling_op->context.average_pooling.multipass_ukernel = xnn_params.f32.avgpool.mp;
604 average_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_average_pooling_multipass;
605 }
606 break;
607 }
608 case xnn_ukernel_type_pixelwise_average_pooling:
609 {
610 if (input_height != average_pooling_op->last_input_height ||
611 input_width != average_pooling_op->last_input_width)
612 {
613 const size_t pixelwise_buffer_size = output_height * output_width * sizeof(float);
614 float* pixelwise_buffer = (float*) realloc(average_pooling_op->pixelwise_buffer, pixelwise_buffer_size);
615 if (pixelwise_buffer == NULL) {
616 xnn_log_error("failed to allocate %zu bytes for pixelwise buffer", pixelwise_buffer_size);
617 return xnn_status_out_of_memory;
618 }
619 average_pooling_op->pixelwise_buffer = pixelwise_buffer;
620
621 float* pixelwise_pointer = pixelwise_buffer;
622 for (size_t output_y = 0; output_y < output_height; output_y++) {
623 const size_t input_y_start = doz(output_y * average_pooling_op->stride_height, average_pooling_op->padding_top);
624 const size_t input_y_end =
625 min(doz(output_y * average_pooling_op->stride_height + average_pooling_op->kernel_height, average_pooling_op->padding_top), input_height);
626 const uint32_t input_y_range = (uint32_t) (input_y_end - input_y_start);
627 for (size_t output_x = 0; output_x < output_width; output_x++) {
628 const size_t input_x_start = doz(output_x * average_pooling_op->stride_width, average_pooling_op->padding_left);
629 const size_t input_x_end =
630 min(doz(output_x * average_pooling_op->stride_width + average_pooling_op->kernel_width, average_pooling_op->padding_left), input_width);
631 const uint32_t input_x_range = (uint32_t) (input_x_end - input_x_start);
632 *pixelwise_pointer++ = 1.0f / ((float) (int32_t) (input_y_range * input_x_range));
633 }
634 }
635 }
636
637 const uint32_t qr = xnn_params.f32.pavgpool.qr;
638 const size_t multipass_adjustment =
639 pooling_size > mr ? round_up(pooling_size - mr, qr) + mr - qr : 0;
640 average_pooling_op->context.pixelwise_average_pooling = (struct pixelwise_average_pooling_context) {
641 .indirect_input = indirection_buffer,
642 .indirect_input_batch_stride = output_height * indirect_input_height_stride,
643 .indirect_input_height_stride = indirect_input_height_stride,
644 .pixelwise_buffer = average_pooling_op->pixelwise_buffer,
645 .pixelwise_buffer_height_stride = output_width * sizeof(float),
646 .output = output,
647 .output_batch_stride = output_height * output_height_stride,
648 .output_height_stride = output_height_stride,
649 .output_width = output_width,
650 .pooling_size = pooling_size,
651 .channels = channels,
652 .zero = average_pooling_op->zero_buffer,
653 .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
654 .output_increment = output_width_stride - channels * sizeof(float),
655 .params.f32 = average_pooling_op->f32_output_params,
656 };
657 if (pooling_size <= mr) {
658 average_pooling_op->context.pixelwise_average_pooling.unipass_ukernel = xnn_params.f32.pavgpool.up;
659 average_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_pixelwise_average_pooling_unipass;
660 } else {
661 average_pooling_op->context.pixelwise_average_pooling.multipass_ukernel = xnn_params.f32.pavgpool.mp;
662 average_pooling_op->compute.task_2d = (pthreadpool_task_2d_t) xnn_compute_pixelwise_average_pooling_multipass;
663 }
664 break;
665 }
666 default:
667 XNN_UNREACHABLE;
668 }
669 average_pooling_op->compute.type = xnn_parallelization_type_2d;
670 average_pooling_op->compute.range[0] = batch_size;
671 average_pooling_op->compute.range[1] = output_height;
672 average_pooling_op->state = xnn_run_state_ready;
673
674 average_pooling_op->last_input = input;
675 average_pooling_op->last_input_height = input_height;
676 average_pooling_op->last_input_width = input_width;
677 average_pooling_op->valid_batch_size = max(valid_batch_size, batch_size);
678
679 return xnn_status_success;
680}