// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include <fxdiv.h>

#include <xnnpack/indirection.h>
#include <xnnpack/operator.h>
#include <xnnpack/math.h>

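// An indirection buffer is an array of pre-resolved pointers into the input
// tensor: one pointer per (output pixel, kernel tap) pair, so that the
// micro-kernels can walk a kernel window by loading pointers instead of
// recomputing input offsets. Taps that fall into the implicit padding point
// at a shared zero buffer. A consuming loop looks roughly like this (an
// illustrative sketch only, not one of the actual micro-kernels):
//
//   for (size_t k = 0; k < kernel_size; k++) {
//     const float* pixel = (const float*) indirection_buffer[k];
//     acc += pixel[channel] * weights[k];
//   }
//
// Initializes the indirection buffer for a 2D convolution. Output pixels are
// processed in tiles of output_tile_size; for each pixel in a tile, every
// kernel tap gets a pointer to its input pixel, or to the zero buffer when
// the tap lands in the padding.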
void xnn_indirection_init_conv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const void* zero = op->zero_buffer;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t input_padding_top = op->padding_top;
  const size_t input_padding_left = op->padding_left;

  const size_t output_size = output_height * output_width;
  const size_t tiled_output_size = round_up(output_size, output_tile_size);
  const size_t kernel_size = kernel_height * kernel_width;

  const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);

  for (size_t output_tile_start = 0; output_tile_start < tiled_output_size; output_tile_start += output_tile_size) {
    for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
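      // tiled_output_size is rounded up, so the last tile may run past the
      // end of the output; clamp the index so that the excess slots simply
      // duplicate the last output pixel.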
      const size_t output_index = min(output_tile_start + output_tile_offset, output_size - 1);
      const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor);
      const size_t output_x = output_y_x.remainder;
      const size_t output_y = output_y_x.quotient;
      for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
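        // input_y is computed in unsigned arithmetic: when the kernel tap
        // falls into the top padding, the subtraction wraps around to a huge
        // value, so the single input_y < input_height check below covers both
        // the top and the bottom bound.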
        const size_t input_y = output_y * stride_height + kernel_y * dilation_height - input_padding_top;
        if (input_y < input_height) {
          for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
            const size_t input_x = output_x * stride_width + kernel_x * dilation_width - input_padding_left;
            const size_t kernel_index = kernel_y * kernel_width + kernel_x;
            const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
            if (input_x < input_width) {
              indirection_buffer[index] = (const void*)
                ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
            } else {
              indirection_buffer[index] = zero;
            }
          }
        } else {
          for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
            const size_t kernel_index = kernel_y * kernel_width + kernel_x;
            const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
            indirection_buffer[index] = zero;
          }
        }
      }
    }
  }
}

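// Initializes the indirection buffer for a 2D deconvolution (transposed
// convolution). Each output pixel is back-projected through the kernel; taps
// whose back-projected coordinates do not land exactly on the stride grid, or
// fall outside the input, point at the zero buffer.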
void xnn_indirection_init_deconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero = op->zero_buffer;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  const size_t output_size = output_height * output_width;
  const size_t tiled_output_size = round_up(output_size, output_tile_size);
  const size_t kernel_size = kernel_height * kernel_width;

  const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);
  const struct fxdiv_divisor_size_t stride_height_divisor = fxdiv_init_size_t(stride_height);
  const struct fxdiv_divisor_size_t stride_width_divisor = fxdiv_init_size_t(stride_width);

  for (size_t output_tile_start = 0; output_tile_start < tiled_output_size; output_tile_start += output_tile_size) {
    for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
      const size_t output_index = min(output_tile_start + output_tile_offset, output_size - 1);
      const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor);
      const size_t output_x = output_y_x.remainder;
      const size_t output_y = output_y_x.quotient;
      for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
        const size_t y = output_y + padding_top - kernel_y * dilation_height;
        const size_t input_y = fxdiv_quotient_size_t(y, stride_height_divisor);
        for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
          const size_t x = output_x + padding_left - kernel_x * dilation_width;
          const size_t input_x = fxdiv_quotient_size_t(x, stride_width_divisor);
          const size_t kernel_index = kernel_y * kernel_width + kernel_x;
          const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
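          // A tap maps to a real input pixel only when the back-projected
          // coordinate is an exact multiple of the stride (checked via
          // input_* * stride_* == coordinate) and lies inside the input;
          // otherwise it points at the zero buffer.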
          if (input_y * stride_height == y && input_y < input_height && input_x * stride_width == x && input_x < input_width) {
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          } else {
            indirection_buffer[index] = zero;
          }
        }
      }
    }
  }
}

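// Initializes the indirection buffer for a 2D subconvolution: a deconvolution
// decomposed into stride_height * stride_width independent convolutions, one
// per output phase (offset_y, offset_x). Each subconvolution_params entry
// records where its slice of the indirection buffer begins and the stride
// between its output rows.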
void xnn_indirection_init_subconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  struct subconvolution_params* subconvolution_params = op->subconvolution_buffer;
  const void* input = op->input;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero = op->zero_buffer;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  const size_t modulo_padding_top = padding_top % stride_height;
  const size_t modulo_padding_left = padding_left % stride_width;
  for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
    const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
    for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
      const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
      const size_t sliced_output_width = divide_round_up(output_width - output_x_start, stride_width);

      subconvolution_params->indirection_buffer = indirection_buffer;
      subconvolution_params->indirection_y_stride =
        subconvolution_params->indirection_x_stride * round_up(sliced_output_width, output_tile_size);
      ++subconvolution_params;

      for (size_t output_y = output_y_start; output_y < output_height; output_y += stride_height) {
        for (size_t output_tile_start = 0; output_tile_start < sliced_output_width; output_tile_start += output_tile_size) {
          for (size_t kernel_y = offset_y; kernel_y < kernel_height; kernel_y += stride_height) {
            assert(doz(output_y + padding_top, kernel_y) % stride_height == 0);
            const size_t y = output_y + padding_top - kernel_y;
            const size_t input_y = y / stride_height;

            for (size_t kernel_x = offset_x; kernel_x < kernel_width; kernel_x += stride_width) {
              for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
                const size_t sliced_output_x = min(output_tile_start + output_tile_offset, sliced_output_width - 1);
                const size_t output_x = output_x_start + sliced_output_x * stride_width;

                assert(doz(output_x + padding_left, kernel_x) % stride_width == 0);
                const size_t x = output_x + padding_left - kernel_x;
                const size_t input_x = x / stride_width;

                if (input_y < input_height && input_x < input_width) {
                  *indirection_buffer++ =
                    (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
                } else {
                  *indirection_buffer++ = zero;
                }
              }
            }
          }
        }
      }
    }
  }
}

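// Initializes the indirection buffer for a 2D depthwise convolution. Pointers
// are laid out with step_height between output rows and step_width between
// output pixels; within one output pixel, the taps of each kernel_x column
// are stored contiguously with kernel_y varying fastest.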
void xnn_indirection_init_dwconv2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero = op->zero_buffer;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t input_padding_top = op->padding_top;
  const size_t input_padding_left = op->padding_left;

  for (size_t output_y = 0; output_y < output_height; output_y++) {
    for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
      const size_t input_y = output_y * stride_height + kernel_y * dilation_height - input_padding_top;
      if (input_y < input_height) {
        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
            const size_t input_x = output_x * stride_width + kernel_x * dilation_width - input_padding_left;
            const size_t index = output_y * step_height + output_x * step_width * kernel_height + kernel_x * kernel_height + kernel_y;
            if (input_x < input_width) {
              indirection_buffer[index] =
                (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
            } else {
              indirection_buffer[index] = zero;
            }
          }
        }
      } else {
        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
            const size_t index = output_y * step_height + output_x * step_width * kernel_height + kernel_x * kernel_height + kernel_y;
            indirection_buffer[index] = zero;
          }
        }
      }
    }
  }
}

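// Initializes the indirection buffer for 2D max pooling. A zero buffer cannot
// stand in for padding here (zero is generally not a neutral element for the
// maximum), so out-of-bounds taps are redirected to in-bounds input pixels
// instead.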
void xnn_indirection_init_maxpool2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t pooling_height = op->kernel_height;
  const size_t pooling_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t input_padding_top = op->padding_top;
  const size_t input_padding_left = op->padding_left;

  const bool any_dilation = (dilation_height | dilation_width) > 1;

  if (any_dilation) {
    // Clamping to the input border doesn't work for pooling with dilation.
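    // Instead, out-of-bounds taps are redirected to a nearby in-bounds pixel
    // with the same phase modulo the dilation (computed as safe_input_y /
    // safe_input_x below).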
    const size_t adjusted_padding_top = input_padding_top % dilation_height;
    const size_t adjusted_padding_left = input_padding_left % dilation_width;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        size_t safe_input_y = output_y * stride_height;
        if XNN_UNPREDICTABLE(safe_input_y < adjusted_padding_top) {
          safe_input_y += dilation_height;
        }
        safe_input_y -= adjusted_padding_top;

        size_t input_y = output_y * stride_height + pooling_y * dilation_height - input_padding_top;
        if XNN_UNPREDICTABLE(input_y >= input_height) {
          input_y = safe_input_y;
        }

        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            size_t safe_input_x = output_x * stride_width;
            if XNN_UNPREDICTABLE(safe_input_x < adjusted_padding_left) {
              safe_input_x += dilation_width;
            }
            safe_input_x -= adjusted_padding_left;

            size_t input_x = output_x * stride_width + pooling_x * dilation_width - input_padding_left;
            if XNN_UNPREDICTABLE(input_x >= input_width) {
              input_x = safe_input_x;
            }

            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  } else {
    const size_t input_x_max = input_width - 1;
    const size_t input_y_max = input_height - 1;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        const size_t input_y = min(doz(output_y * stride_height + pooling_y * dilation_height, input_padding_top), input_y_max);
        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            const size_t input_x = min(doz(output_x * stride_width + pooling_x * dilation_width, input_padding_left), input_x_max);
            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  }
}

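// Initializes the indirection buffer and packed weights for 2D bilinear
// resizing of NHWC tensors. For every output pixel, four pointers (top-left,
// top-right, bottom-left, bottom-right input pixels) and two interpolation
// weights (alpha_x, alpha_y) are written.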
void xnn_indirection_init_resize_bilinear2d_hwc_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  } else {
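    // Half-pixel centers: map the center of each output pixel into input
    // coordinates, then clamp to the valid range.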
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  }
}

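// Same as xnn_indirection_init_resize_bilinear2d_hwc_f32, except that the
// interpolation weights are packed as signed Q11 fixed-point values.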
void xnn_indirection_init_resize_bilinear2d_hwc_q11(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  int16_t* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
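        // Round the interpolation weights to Q11 fixed point (alpha * 2**11).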
        packed_weights[0] = (int16_t) lrintf(alpha_x * 0x1.0p+11f);
        packed_weights[1] = (int16_t) lrintf(alpha_y * 0x1.0p+11f);
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  } else {
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        packed_weights[0] = (int16_t) lrintf(alpha_x * 0x1.0p+11f);
        packed_weights[1] = (int16_t) lrintf(alpha_y * 0x1.0p+11f);
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  }
}

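// Initializes the indirection buffer and packed weights for 2D bilinear
// resizing of CHW tensors. Only two pointers per output pixel are stored (the
// top and bottom rows at the left column); the CHW micro-kernels read the
// right neighbors relative to these pointers, hence the adjustment below when
// input_x_left is the last column.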
void xnn_indirection_init_resize_bilinear2d_chw_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height > 1);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width > 1);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        uint32_t input_x_left = (uint32_t) (int32_t) input_x;

        float alpha_x = input_x - (float) input_x_left;
        if (input_x_left == input_x_max) {
          // Ensure that there is a pixel to the right of the one pointed at,
          // as required by some CHW kernels.
          --input_x_left;
          alpha_x = 1.0f;
        }
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 2;
        packed_weights += 2;
      }
    }
  } else {
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);

        float alpha_x = input_x - (float) input_x_left;
        if (input_x_left == input_x_max) {
          // Ensure that there is a pixel to the right of the one pointed at,
          // as required by some CHW kernels.
          --input_x_left;
          alpha_x = 1.0f;
        }

        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 2;
        packed_weights += 2;
      }
    }
  }
}

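// Initializes the indirection buffer for 2D max unpooling. Unlike the other
// initializers, these pointers reference the output tensor: every input pixel
// gets one pointer per pooling-window position, clamped to the output bounds.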
void xnn_indirection_init_unpool2d(
  xnn_operator_t op,
  size_t batch_start,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* output = op->output;
  const size_t output_pixel_stride = op->output_pixel_stride << log2_element_size;
  const size_t batch_size = op->batch_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t pooling_height = op->kernel_height;
  const size_t pooling_width = op->kernel_width;
  const size_t output_padding_top = op->padding_top;
  const size_t output_padding_left = op->padding_left;

  for (size_t image = batch_start; image < batch_size; image++) {
    for (size_t input_y = 0; input_y < input_height; input_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        const size_t output_y = min(doz(input_y * pooling_height + pooling_y, output_padding_top), output_height - 1);
        for (size_t input_x = 0; input_x < input_width; input_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            const size_t output_x = min(doz(input_x * pooling_width + pooling_x, output_padding_left), output_width - 1);
            indirection_buffer[(((image * input_height + input_y) * input_width + input_x) * pooling_width + pooling_x) * pooling_height + pooling_y] =
              (const void*) ((uintptr_t) output + ((image * output_height + output_y) * output_width + output_x) * output_pixel_stride);
          }
        }
      }
    }
  }
}