blob: d6c22c175ad07a5ee1f05a1346930d1fc3418399 [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
8
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include <fxdiv.h>

#include <xnnpack/indirection.h>
#include <xnnpack/operator.h>
#include <xnnpack/math.h>
16
17
18void xnn_indirection_init_conv2d(
19 xnn_operator_t op,
20 size_t output_tile_size,
21 uint32_t log2_element_size)
22{
23 const void** indirection_buffer = op->indirection_buffer;
24 const void* input = op->input;
25 const void* zero = op->zero_buffer;
26 const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
27 const size_t input_height = op->input_height;
28 const size_t input_width = op->input_width;
29 const size_t output_height = op->output_height;
30 const size_t output_width = op->output_width;
31 const size_t kernel_height = op->kernel_height;
32 const size_t kernel_width = op->kernel_width;
33 const size_t stride_height = op->stride_height;
34 const size_t stride_width = op->stride_width;
35 const size_t dilation_height = op->dilation_height;
36 const size_t dilation_width = op->dilation_width;
37 const size_t input_padding_top = op->padding_top;
38 const size_t input_padding_left = op->padding_left;
39
40 const size_t output_size = output_height * output_width;
41 const size_t tiled_output_size = round_up(output_size, output_tile_size);
42 const size_t kernel_size = kernel_height * kernel_width;
43
44 const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);
45
46 for (size_t output_tile_start = 0; output_tile_start < tiled_output_size; output_tile_start += output_tile_size) {
47 for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
48 const size_t output_index = min(output_tile_start + output_tile_offset, output_size - 1);
49 const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor);
50 const size_t output_x = output_y_x.remainder;
51 const size_t output_y = output_y_x.quotient;
52 for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
53 const size_t input_y = output_y * stride_height + kernel_y * dilation_height - input_padding_top;
54 if (input_y < input_height) {
55 for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
56 const size_t input_x = output_x * stride_width + kernel_x * dilation_width - input_padding_left;
57 const size_t kernel_index = kernel_y * kernel_width + kernel_x;
58 const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
59 if (input_x < input_width) {
60 indirection_buffer[index] = (const void*)
61 ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
62 } else {
63 indirection_buffer[index] = zero;
64 }
65 }
66 } else {
67 for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
68 const size_t kernel_index = kernel_y * kernel_width + kernel_x;
69 const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
70 indirection_buffer[index] = zero;
71 }
72 }
73 }
74 }
75 }
76}
77
78void xnn_indirection_init_dwconv2d(
79 xnn_operator_t op,
80 size_t batch_start,
81 size_t step_height,
82 size_t step_width,
83 uint32_t log2_element_size)
84{
85 const void** indirection_buffer = op->indirection_buffer;
86 const void* input = op->input;
87 const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
88 const void* zero = op->zero_buffer;
89 const size_t batch_size = op->batch_size;
90 const size_t input_height = op->input_height;
91 const size_t input_width = op->input_width;
92 const size_t output_height = op->output_height;
93 const size_t output_width = op->output_width;
94 const size_t kernel_height = op->kernel_height;
95 const size_t kernel_width = op->kernel_width;
96 const size_t stride_height = op->stride_height;
97 const size_t stride_width = op->stride_width;
98 const size_t dilation_height = op->dilation_height;
99 const size_t dilation_width = op->dilation_width;
100 const size_t input_padding_top = op->padding_top;
101 const size_t input_padding_left = op->padding_left;
102
103 for (size_t batch_index = batch_start; batch_index < batch_size; batch_index++) {
104 for (size_t output_y = 0; output_y < output_height; output_y++) {
105 for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
106 const size_t input_y = output_y * stride_height + kernel_y * dilation_height - input_padding_top;
107 if (input_y < input_height) {
108 for (size_t output_x = 0; output_x < output_width; output_x++) {
109 for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
110 const size_t input_x = output_x * stride_width + kernel_x * dilation_width - input_padding_left;
111 const size_t index = (batch_index * output_height + output_y) * step_height + output_x * step_width * kernel_height + kernel_x * kernel_height + kernel_y;
112 if (input_x < input_width) {
113 indirection_buffer[index] =
114 (const void*) ((uintptr_t) input + ((batch_index * input_height + input_y) * input_width + input_x) * input_pixel_stride);
115 } else {
116 indirection_buffer[index] = zero;
117 }
118 }
119 }
120 } else {
121 for (size_t output_x = 0; output_x < output_width; output_x++) {
122 for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
123 const size_t index = (batch_index * output_height + output_y) * step_height + output_x * step_width * kernel_height + kernel_x * kernel_height + kernel_y;
124 indirection_buffer[index] = zero;
125 }
126 }
127 }
128 }
129 }
130 }
131}
132
133void xnn_indirection_init_deconv2d(
134 xnn_operator_t op,
135 size_t output_tile_size,
136 uint32_t log2_element_size)
137{
138 const void** indirection_buffer = op->indirection_buffer;
139 const void* input = op->input;
140 const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
141 const void* zero = op->zero_buffer;
142 const size_t input_height = op->input_height;
143 const size_t input_width = op->input_width;
144 const size_t output_height = op->output_height;
145 const size_t output_width = op->output_width;
146 const size_t kernel_height = op->kernel_height;
147 const size_t kernel_width = op->kernel_width;
148 const size_t stride_height = op->stride_height;
149 const size_t stride_width = op->stride_width;
150 const size_t dilation_height = op->dilation_height;
151 const size_t dilation_width = op->dilation_width;
152 const size_t padding_top = op->padding_top;
153 const size_t padding_left = op->padding_left;
154
155 const size_t output_size = output_height * output_width;
156 const size_t tiled_output_size = round_up(output_size, output_tile_size);
157 const size_t kernel_size = kernel_height * kernel_width;
158
159 const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);
160 const struct fxdiv_divisor_size_t stride_height_divisor = fxdiv_init_size_t(stride_height);
161 const struct fxdiv_divisor_size_t stride_width_divisor = fxdiv_init_size_t(stride_width);
162
163 for (size_t output_tile_start = 0; output_tile_start < tiled_output_size; output_tile_start += output_tile_size) {
164 for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
165 const size_t output_index = min(output_tile_start + output_tile_offset, output_size - 1);
166 const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor);
167 const size_t output_x = output_y_x.remainder;
168 const size_t output_y = output_y_x.quotient;
169 for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
170 const size_t y = output_y + padding_top - kernel_y * dilation_height;
171 const size_t input_y = fxdiv_quotient_size_t(y, stride_height_divisor);
172 for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
173 const size_t x = output_x + padding_left - kernel_x * dilation_width;
174 const size_t input_x = fxdiv_quotient_size_t(x, stride_width_divisor);
175 const size_t kernel_index = kernel_y * kernel_width + kernel_x;
176 const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
177 if (input_y * stride_height == y && input_y < input_height && input_x * stride_width == x && input_x < input_width) {
178 indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
179 } else {
180 indirection_buffer[index] = zero;
181 }
182 }
183 }
184 }
185 }
186}
187
188void xnn_indirection_init_subconv2d(
189 xnn_operator_t op,
190 size_t output_tile_size,
191 uint32_t log2_element_size)
192{
193 const void** indirection_buffer = op->indirection_buffer;
194 struct subconvolution_params* subconvolution_params = op->subconvolution_buffer;
195 const void* input = op->input;
196 const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
197 const void* zero = op->zero_buffer;
198 const size_t input_height = op->input_height;
199 const size_t input_width = op->input_width;
200 const size_t output_height = op->output_height;
201 const size_t output_width = op->output_width;
202 const size_t kernel_height = op->kernel_height;
203 const size_t kernel_width = op->kernel_width;
204 const size_t stride_height = op->stride_height;
205 const size_t stride_width = op->stride_width;
206 const size_t padding_top = op->padding_top;
207 const size_t padding_left = op->padding_left;
208
209 const size_t modulo_padding_top = padding_top % stride_height;
210 const size_t modulo_padding_left = padding_left % stride_width;
211 for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
212 const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
213 for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
214 const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
215 const size_t sliced_output_width = divide_round_up(output_width - output_x_start, stride_width);
216
217 subconvolution_params->indirection_buffer = indirection_buffer;
218 subconvolution_params->indirection_y_stride =
219 subconvolution_params->indirection_x_stride * round_up(sliced_output_width, output_tile_size);
220 ++subconvolution_params;
221
222 for (size_t output_y = output_y_start; output_y < output_height; output_y += stride_height) {
223 for (size_t output_tile_start = 0; output_tile_start < sliced_output_width; output_tile_start += output_tile_size) {
224 for (size_t kernel_y = offset_y; kernel_y < kernel_height; kernel_y += stride_height) {
225 assert(doz(output_y + padding_top, kernel_y) % stride_height == 0);
226 const size_t y = output_y + padding_top - kernel_y;
227 const size_t input_y = y / stride_height;
228
229 for (size_t kernel_x = offset_x; kernel_x < kernel_width; kernel_x += stride_width) {
230 for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
231 const size_t sliced_output_x = min(output_tile_start + output_tile_offset, sliced_output_width - 1);
232 const size_t output_x = output_x_start + sliced_output_x * stride_width;
233
234 assert(doz(output_x + padding_left, kernel_x) % stride_width == 0);
235 const size_t x = output_x + padding_left - kernel_x;
236 const size_t input_x = x / stride_width;
237
238 if (input_y < input_height && input_x < input_width) {
239 *indirection_buffer++ =
240 (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
241 } else {
242 *indirection_buffer++ = zero;
243 }
244 }
245 }
246 }
247 }
248 }
249 }
250 }
251}
252
253void xnn_indirection_init_maxpool2d(
254 xnn_operator_t op,
XNNPACK Teamb455b122019-09-27 18:10:33 -0700255 size_t step_height,
256 size_t step_width,
257 uint32_t log2_element_size)
258{
259 const void** indirection_buffer = op->indirection_buffer;
260 const void* input = op->input;
261 const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
XNNPACK Teamb455b122019-09-27 18:10:33 -0700262 const size_t input_height = op->input_height;
263 const size_t input_width = op->input_width;
264 const size_t output_height = op->output_height;
265 const size_t output_width = op->output_width;
266 const size_t pooling_height = op->kernel_height;
267 const size_t pooling_width = op->kernel_width;
268 const size_t stride_height = op->stride_height;
269 const size_t stride_width = op->stride_width;
270 const size_t dilation_height = op->dilation_height;
271 const size_t dilation_width = op->dilation_width;
272 const size_t input_padding_top = op->padding_top;
273 const size_t input_padding_left = op->padding_left;
274
Marat Dukhanc58bd342020-03-19 18:53:05 -0700275 const bool any_dilation = (dilation_height | dilation_width) > 1;
276
277 if (any_dilation) {
278 // Clamp to the border doesn't work for pooling with dilation.
279 const size_t adjusted_padding_top = input_padding_top % dilation_height;
280 const size_t adjusted_padding_left = input_padding_left % dilation_width;
281 for (size_t output_y = 0; output_y < output_height; output_y++) {
282 for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
283 size_t safe_input_y = output_y * stride_height;
284 if XNN_UNPREDICTABLE(safe_input_y < adjusted_padding_top) {
285 safe_input_y += dilation_height;
286 }
287 safe_input_y -= adjusted_padding_top;
288
289 size_t input_y = output_y * stride_height + pooling_y * dilation_height - input_padding_top;
290 if XNN_UNPREDICTABLE(input_y >= input_height) {
291 input_y = safe_input_y;
292 }
293
294 for (size_t output_x = 0; output_x < output_width; output_x++) {
295 for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
296 size_t safe_input_x = output_x * stride_width;
297 if XNN_UNPREDICTABLE(safe_input_x < adjusted_padding_left) {
298 safe_input_x += dilation_width;
299 }
300 safe_input_x -= adjusted_padding_left;
301
302 size_t input_x = output_x * stride_width + pooling_x * dilation_width - input_padding_left;
303 if XNN_UNPREDICTABLE(input_x >= input_width) {
304 input_x = safe_input_x;
305 }
306
307 const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
Marat Dukhanbdc80992020-04-13 01:21:18 -0700308 indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
Marat Dukhanc58bd342020-03-19 18:53:05 -0700309 }
310 }
311 }
312 }
313 } else {
314 const size_t input_x_max = input_width - 1;
315 const size_t input_y_max = input_height - 1;
316 for (size_t output_y = 0; output_y < output_height; output_y++) {
317 for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
318 const size_t input_y = min(doz(output_y * stride_height + pooling_y * dilation_height, input_padding_top), input_y_max);
319 for (size_t output_x = 0; output_x < output_width; output_x++) {
320 for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
321 const size_t input_x = min(doz(output_x * stride_width + pooling_x * dilation_width, input_padding_left), input_x_max);
322 const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
Marat Dukhanbdc80992020-04-13 01:21:18 -0700323 indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
Marat Dukhanc58bd342020-03-19 18:53:05 -0700324 }
XNNPACK Teamb455b122019-09-27 18:10:33 -0700325 }
326 }
327 }
328 }
329}
330
// Builds the indirection buffer and interpolation weights for bilinear
// resizing.
//
// For every output pixel, in row-major order, four pointers are appended to
// indirection_buffer (top-left, top-right, bottom-left, bottom-right source
// pixels) and two floats are appended to packed_weights (horizontal weight
// alpha_x, then vertical weight alpha_y). Source pixel addresses are
//   input + (y * input_width + x) * input_pixel_stride.
// The right/bottom neighbours are clamped to the last input column/row.
//
// Source coordinates are computed as
//   - output * scale                       if tensorflow_legacy or align_corners
//   - output * scale + (0.5 * scale - 0.5), clamped to [0, size - 1], otherwise
// where scale = (input - a) / (output - a) and a is 1 when align_corners is set
// and the output extent is not 1, else 0.
//
// All four extents must be non-zero and below 2**24 (checked by assert).
void xnn_indirection_init_resize_bilinear2d_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uintptr_t input_address = (uintptr_t) input;
  const uint32_t last_row = (uint32_t) input_height - 1;
  const uint32_t last_col = (uint32_t) input_width - 1;

  if (!tensorflow_legacy && !align_corners) {
    // Half-pixel-centre mapping: shift by half a pixel, then clamp to the input.
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) last_row);
      const uint32_t top = (uint32_t) (int32_t) input_y;
      assert((int32_t) top >= 0);
      const uint32_t bottom = math_min_u32(top + 1, last_row);
      const float alpha_y = input_y - (float) top;
      const size_t top_offset = top * input_width;
      const size_t bottom_offset = bottom * input_width;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) last_col);
        const uint32_t left = (uint32_t) (int32_t) input_x;
        assert((int32_t) left >= 0);
        const uint32_t right = math_min_u32(left + 1, last_col);
        const float alpha_x = input_x - (float) left;

        *indirection_buffer++ = (const void*) (input_address + (top_offset + left) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (top_offset + right) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (bottom_offset + left) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (bottom_offset + right) * input_pixel_stride);
        *packed_weights++ = alpha_x;
        *packed_weights++ = alpha_y;
      }
    }
  } else {
    // Legacy / align-corners mapping: plain scaling, no offset and no clamping
    // of the source coordinate (its range is only checked by assert).
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t top = (uint32_t) (int32_t) input_y;
      const uint32_t bottom = math_min_u32(top + 1, last_row);
      const float alpha_y = input_y - (float) top;
      const size_t top_offset = top * input_width;
      const size_t bottom_offset = bottom * input_width;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        const uint32_t left = (uint32_t) (int32_t) input_x;
        const uint32_t right = math_min_u32(left + 1, last_col);
        const float alpha_x = input_x - (float) left;

        *indirection_buffer++ = (const void*) (input_address + (top_offset + left) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (top_offset + right) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (bottom_offset + left) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (bottom_offset + right) * input_pixel_stride);
        *packed_weights++ = alpha_x;
        *packed_weights++ = alpha_y;
      }
    }
  }
}
425
XNNPACK Teamb455b122019-09-27 18:10:33 -0700426void xnn_indirection_init_unpool2d(
427 xnn_operator_t op,
428 size_t batch_start,
429 uint32_t log2_element_size)
430{
431 const void** indirection_buffer = op->indirection_buffer;
432 const void* output = op->output;
433 const size_t output_pixel_stride = op->output_pixel_stride << log2_element_size;
434 const size_t batch_size = op->batch_size;
435 const size_t input_height = op->input_height;
436 const size_t input_width = op->input_width;
437 const size_t output_height = op->output_height;
438 const size_t output_width = op->output_width;
439 const size_t pooling_height = op->kernel_height;
440 const size_t pooling_width = op->kernel_width;
441 const size_t output_padding_top = op->padding_top;
442 const size_t output_padding_left = op->padding_left;
443
444 for (size_t image = batch_start; image < batch_size; image++) {
445 for (size_t input_y = 0; input_y < input_height; input_y++) {
446 for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
447 const size_t output_y = min(doz(input_y * pooling_height + pooling_y, output_padding_top), output_height - 1);
448 for (size_t input_x = 0; input_x < input_width; input_x++) {
449 for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
450 const size_t output_x = min(doz(input_x * pooling_width + pooling_x, output_padding_left), output_width - 1);
451 indirection_buffer[(((image * input_height + input_y) * input_width + input_x) * pooling_width + pooling_x) * pooling_height + pooling_y] =
Marat Dukhanbdc80992020-04-13 01:21:18 -0700452 (const void*) ((uintptr_t) output + ((image * output_height + output_y) * output_width + output_x) * output_pixel_stride);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700453 }
454 }
455 }
456 }
457 }
458}