| XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 1 | // Copyright (c) Facebook, Inc. and its affiliates. | 
|  | 2 | // All rights reserved. | 
|  | 3 | // | 
|  | 4 | // Copyright 2019 Google LLC | 
|  | 5 | // | 
|  | 6 | // This source code is licensed under the BSD-style license found in the | 
|  | 7 | // LICENSE file in the root directory of this source tree. | 
|  | 8 |  | 
|  | 9 | #include <stddef.h> | 
| Marat Dukhan | 0ab7553 | 2021-11-24 16:50:30 -0800 | [diff] [blame] | 10 | #include <math.h> | 
| XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 11 |  | 
|  | 12 | #include <fxdiv.h> | 
|  | 13 |  | 
|  | 14 | #include <xnnpack/indirection.h> | 
|  | 15 | #include <xnnpack/operator.h> | 
|  | 16 | #include <xnnpack/math.h> | 
|  | 17 |  | 
|  | 18 |  | 
|  | 19 | void xnn_indirection_init_conv2d( | 
|  | 20 | xnn_operator_t op, | 
|  | 21 | size_t output_tile_size, | 
|  | 22 | uint32_t log2_element_size) | 
|  | 23 | { | 
|  | 24 | const void** indirection_buffer          = op->indirection_buffer; | 
|  | 25 | const void* input                        = op->input; | 
|  | 26 | const void* zero                         = op->zero_buffer; | 
|  | 27 | const size_t input_pixel_stride          = op->input_pixel_stride << log2_element_size; | 
|  | 28 | const size_t input_height                = op->input_height; | 
|  | 29 | const size_t input_width                 = op->input_width; | 
|  | 30 | const size_t output_height               = op->output_height; | 
|  | 31 | const size_t output_width                = op->output_width; | 
|  | 32 | const size_t kernel_height               = op->kernel_height; | 
|  | 33 | const size_t kernel_width                = op->kernel_width; | 
|  | 34 | const size_t stride_height               = op->stride_height; | 
|  | 35 | const size_t stride_width                = op->stride_width; | 
|  | 36 | const size_t dilation_height             = op->dilation_height; | 
|  | 37 | const size_t dilation_width              = op->dilation_width; | 
|  | 38 | const size_t input_padding_top           = op->padding_top; | 
|  | 39 | const size_t input_padding_left          = op->padding_left; | 
|  | 40 |  | 
|  | 41 | const size_t output_size = output_height * output_width; | 
|  | 42 | const size_t tiled_output_size = round_up(output_size, output_tile_size); | 
|  | 43 | const size_t kernel_size = kernel_height * kernel_width; | 
|  | 44 |  | 
|  | 45 | const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width); | 
|  | 46 |  | 
|  | 47 | for (size_t output_tile_start = 0; output_tile_start < tiled_output_size; output_tile_start += output_tile_size) { | 
|  | 48 | for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) { | 
|  | 49 | const size_t output_index = min(output_tile_start + output_tile_offset, output_size - 1); | 
|  | 50 | const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor); | 
|  | 51 | const size_t output_x = output_y_x.remainder; | 
|  | 52 | const size_t output_y = output_y_x.quotient; | 
|  | 53 | for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) { | 
|  | 54 | const size_t input_y = output_y * stride_height + kernel_y * dilation_height - input_padding_top; | 
|  | 55 | if (input_y < input_height) { | 
|  | 56 | for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) { | 
|  | 57 | const size_t input_x = output_x * stride_width + kernel_x * dilation_width - input_padding_left; | 
|  | 58 | const size_t kernel_index = kernel_y * kernel_width + kernel_x; | 
|  | 59 | const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset; | 
|  | 60 | if (input_x < input_width) { | 
|  | 61 | indirection_buffer[index] = (const void*) | 
|  | 62 | ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride); | 
|  | 63 | } else { | 
|  | 64 | indirection_buffer[index] = zero; | 
|  | 65 | } | 
|  | 66 | } | 
|  | 67 | } else { | 
|  | 68 | for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) { | 
|  | 69 | const size_t kernel_index = kernel_y * kernel_width + kernel_x; | 
|  | 70 | const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset; | 
|  | 71 | indirection_buffer[index] = zero; | 
|  | 72 | } | 
|  | 73 | } | 
|  | 74 | } | 
|  | 75 | } | 
|  | 76 | } | 
|  | 77 | } | 
|  | 78 |  | 
| XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 79 | void xnn_indirection_init_deconv2d( | 
|  | 80 | xnn_operator_t op, | 
|  | 81 | size_t output_tile_size, | 
|  | 82 | uint32_t log2_element_size) | 
|  | 83 | { | 
|  | 84 | const void** indirection_buffer = op->indirection_buffer; | 
|  | 85 | const void* input               = op->input; | 
|  | 86 | const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size; | 
|  | 87 | const void* zero                = op->zero_buffer; | 
|  | 88 | const size_t input_height       = op->input_height; | 
|  | 89 | const size_t input_width        = op->input_width; | 
|  | 90 | const size_t output_height      = op->output_height; | 
|  | 91 | const size_t output_width       = op->output_width; | 
|  | 92 | const size_t kernel_height      = op->kernel_height; | 
|  | 93 | const size_t kernel_width       = op->kernel_width; | 
|  | 94 | const size_t stride_height      = op->stride_height; | 
|  | 95 | const size_t stride_width       = op->stride_width; | 
|  | 96 | const size_t dilation_height    = op->dilation_height; | 
|  | 97 | const size_t dilation_width     = op->dilation_width; | 
|  | 98 | const size_t padding_top        = op->padding_top; | 
|  | 99 | const size_t padding_left       = op->padding_left; | 
|  | 100 |  | 
|  | 101 | const size_t output_size = output_height * output_width; | 
|  | 102 | const size_t tiled_output_size = round_up(output_size, output_tile_size); | 
|  | 103 | const size_t kernel_size = kernel_height * kernel_width; | 
|  | 104 |  | 
|  | 105 | const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width); | 
|  | 106 | const struct fxdiv_divisor_size_t stride_height_divisor = fxdiv_init_size_t(stride_height); | 
|  | 107 | const struct fxdiv_divisor_size_t stride_width_divisor = fxdiv_init_size_t(stride_width); | 
|  | 108 |  | 
|  | 109 | for (size_t output_tile_start = 0; output_tile_start < tiled_output_size; output_tile_start += output_tile_size) { | 
|  | 110 | for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) { | 
|  | 111 | const size_t output_index = min(output_tile_start + output_tile_offset, output_size - 1); | 
|  | 112 | const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor); | 
|  | 113 | const size_t output_x = output_y_x.remainder; | 
|  | 114 | const size_t output_y = output_y_x.quotient; | 
|  | 115 | for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) { | 
|  | 116 | const size_t y = output_y + padding_top - kernel_y * dilation_height; | 
|  | 117 | const size_t input_y = fxdiv_quotient_size_t(y, stride_height_divisor); | 
|  | 118 | for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) { | 
|  | 119 | const size_t x = output_x + padding_left - kernel_x * dilation_width; | 
|  | 120 | const size_t input_x = fxdiv_quotient_size_t(x, stride_width_divisor); | 
|  | 121 | const size_t kernel_index = kernel_y * kernel_width + kernel_x; | 
|  | 122 | const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset; | 
|  | 123 | if (input_y * stride_height == y && input_y < input_height && input_x * stride_width == x && input_x < input_width) { | 
|  | 124 | indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride); | 
|  | 125 | } else { | 
|  | 126 | indirection_buffer[index] = zero; | 
|  | 127 | } | 
|  | 128 | } | 
|  | 129 | } | 
|  | 130 | } | 
|  | 131 | } | 
|  | 132 | } | 
|  | 133 |  | 
|  | 134 | void xnn_indirection_init_subconv2d( | 
|  | 135 | xnn_operator_t op, | 
|  | 136 | size_t output_tile_size, | 
|  | 137 | uint32_t log2_element_size) | 
|  | 138 | { | 
|  | 139 | const void** indirection_buffer                     = op->indirection_buffer; | 
|  | 140 | struct subconvolution_params* subconvolution_params = op->subconvolution_buffer; | 
|  | 141 | const void* input                                   = op->input; | 
|  | 142 | const size_t input_pixel_stride                     = op->input_pixel_stride << log2_element_size; | 
|  | 143 | const void* zero                                    = op->zero_buffer; | 
|  | 144 | const size_t input_height                           = op->input_height; | 
|  | 145 | const size_t input_width                            = op->input_width; | 
|  | 146 | const size_t output_height                          = op->output_height; | 
|  | 147 | const size_t output_width                           = op->output_width; | 
|  | 148 | const size_t kernel_height                          = op->kernel_height; | 
|  | 149 | const size_t kernel_width                           = op->kernel_width; | 
|  | 150 | const size_t stride_height                          = op->stride_height; | 
|  | 151 | const size_t stride_width                           = op->stride_width; | 
|  | 152 | const size_t padding_top                            = op->padding_top; | 
|  | 153 | const size_t padding_left                           = op->padding_left; | 
|  | 154 |  | 
|  | 155 | const size_t modulo_padding_top = padding_top % stride_height; | 
|  | 156 | const size_t modulo_padding_left = padding_left % stride_width; | 
|  | 157 | for (size_t offset_y = 0; offset_y < stride_height; offset_y++) { | 
|  | 158 | const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height); | 
|  | 159 | for (size_t offset_x = 0; offset_x < stride_width; offset_x++) { | 
|  | 160 | const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width); | 
|  | 161 | const size_t sliced_output_width = divide_round_up(output_width - output_x_start, stride_width); | 
|  | 162 |  | 
|  | 163 | subconvolution_params->indirection_buffer = indirection_buffer; | 
|  | 164 | subconvolution_params->indirection_y_stride = | 
|  | 165 | subconvolution_params->indirection_x_stride * round_up(sliced_output_width, output_tile_size); | 
|  | 166 | ++subconvolution_params; | 
|  | 167 |  | 
|  | 168 | for (size_t output_y = output_y_start; output_y < output_height; output_y += stride_height) { | 
|  | 169 | for (size_t output_tile_start = 0; output_tile_start < sliced_output_width; output_tile_start += output_tile_size) { | 
|  | 170 | for (size_t kernel_y = offset_y; kernel_y < kernel_height; kernel_y += stride_height) { | 
|  | 171 | assert(doz(output_y + padding_top, kernel_y) % stride_height == 0); | 
|  | 172 | const size_t y = output_y + padding_top - kernel_y; | 
|  | 173 | const size_t input_y = y / stride_height; | 
|  | 174 |  | 
|  | 175 | for (size_t kernel_x = offset_x; kernel_x < kernel_width; kernel_x += stride_width) { | 
|  | 176 | for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) { | 
|  | 177 | const size_t sliced_output_x = min(output_tile_start + output_tile_offset, sliced_output_width - 1); | 
|  | 178 | const size_t output_x = output_x_start + sliced_output_x * stride_width; | 
|  | 179 |  | 
|  | 180 | assert(doz(output_x + padding_left, kernel_x) % stride_width == 0); | 
|  | 181 | const size_t x = output_x + padding_left - kernel_x; | 
|  | 182 | const size_t input_x = x / stride_width; | 
|  | 183 |  | 
|  | 184 | if (input_y < input_height && input_x < input_width) { | 
|  | 185 | *indirection_buffer++ = | 
|  | 186 | (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride); | 
|  | 187 | } else { | 
|  | 188 | *indirection_buffer++ = zero; | 
|  | 189 | } | 
|  | 190 | } | 
|  | 191 | } | 
|  | 192 | } | 
|  | 193 | } | 
|  | 194 | } | 
|  | 195 | } | 
|  | 196 | } | 
|  | 197 | } | 
|  | 198 |  | 
| Marat Dukhan | c79427c | 2020-10-15 09:04:21 -0700 | [diff] [blame] | 199 | void xnn_indirection_init_dwconv2d( | 
|  | 200 | xnn_operator_t op, | 
|  | 201 | size_t step_height, | 
|  | 202 | size_t step_width, | 
|  | 203 | uint32_t log2_element_size) | 
|  | 204 | { | 
|  | 205 | const void** indirection_buffer = op->indirection_buffer; | 
|  | 206 | const void* input               = op->input; | 
|  | 207 | const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size; | 
|  | 208 | const void* zero                = op->zero_buffer; | 
|  | 209 | const size_t input_height       = op->input_height; | 
|  | 210 | const size_t input_width        = op->input_width; | 
|  | 211 | const size_t output_height      = op->output_height; | 
|  | 212 | const size_t output_width       = op->output_width; | 
|  | 213 | const size_t kernel_height      = op->kernel_height; | 
|  | 214 | const size_t kernel_width       = op->kernel_width; | 
|  | 215 | const size_t stride_height      = op->stride_height; | 
|  | 216 | const size_t stride_width       = op->stride_width; | 
|  | 217 | const size_t dilation_height    = op->dilation_height; | 
|  | 218 | const size_t dilation_width     = op->dilation_width; | 
|  | 219 | const size_t input_padding_top  = op->padding_top; | 
|  | 220 | const size_t input_padding_left = op->padding_left; | 
|  | 221 |  | 
|  | 222 | for (size_t output_y = 0; output_y < output_height; output_y++) { | 
|  | 223 | for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) { | 
|  | 224 | const size_t input_y = output_y * stride_height + kernel_y * dilation_height - input_padding_top; | 
|  | 225 | if (input_y < input_height) { | 
|  | 226 | for (size_t output_x = 0; output_x < output_width; output_x++) { | 
|  | 227 | for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) { | 
|  | 228 | const size_t input_x = output_x * stride_width + kernel_x * dilation_width - input_padding_left; | 
|  | 229 | const size_t index = output_y * step_height + output_x * step_width * kernel_height + kernel_x * kernel_height + kernel_y; | 
|  | 230 | if (input_x < input_width) { | 
|  | 231 | indirection_buffer[index] = | 
|  | 232 | (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride); | 
|  | 233 | } else { | 
|  | 234 | indirection_buffer[index] = zero; | 
|  | 235 | } | 
|  | 236 | } | 
|  | 237 | } | 
|  | 238 | } else { | 
|  | 239 | for (size_t output_x = 0; output_x < output_width; output_x++) { | 
|  | 240 | for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) { | 
|  | 241 | const size_t index = output_y * step_height + output_x * step_width * kernel_height + kernel_x * kernel_height + kernel_y; | 
|  | 242 | indirection_buffer[index] = zero; | 
|  | 243 | } | 
|  | 244 | } | 
|  | 245 | } | 
|  | 246 | } | 
|  | 247 | } | 
|  | 248 | } | 
|  | 249 |  | 
| XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 250 | void xnn_indirection_init_maxpool2d( | 
|  | 251 | xnn_operator_t op, | 
| XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 252 | size_t step_height, | 
|  | 253 | size_t step_width, | 
|  | 254 | uint32_t log2_element_size) | 
|  | 255 | { | 
|  | 256 | const void** indirection_buffer = op->indirection_buffer; | 
|  | 257 | const void* input               = op->input; | 
|  | 258 | const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size; | 
| XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 259 | const size_t input_height       = op->input_height; | 
|  | 260 | const size_t input_width        = op->input_width; | 
|  | 261 | const size_t output_height      = op->output_height; | 
|  | 262 | const size_t output_width       = op->output_width; | 
|  | 263 | const size_t pooling_height     = op->kernel_height; | 
|  | 264 | const size_t pooling_width      = op->kernel_width; | 
|  | 265 | const size_t stride_height      = op->stride_height; | 
|  | 266 | const size_t stride_width       = op->stride_width; | 
|  | 267 | const size_t dilation_height    = op->dilation_height; | 
|  | 268 | const size_t dilation_width     = op->dilation_width; | 
|  | 269 | const size_t input_padding_top  = op->padding_top; | 
|  | 270 | const size_t input_padding_left = op->padding_left; | 
|  | 271 |  | 
| Marat Dukhan | c58bd34 | 2020-03-19 18:53:05 -0700 | [diff] [blame] | 272 | const bool any_dilation = (dilation_height | dilation_width) > 1; | 
|  | 273 |  | 
|  | 274 | if (any_dilation) { | 
|  | 275 | // Clamp to the border doesn't work for pooling with dilation. | 
|  | 276 | const size_t adjusted_padding_top = input_padding_top % dilation_height; | 
|  | 277 | const size_t adjusted_padding_left = input_padding_left % dilation_width; | 
|  | 278 | for (size_t output_y = 0; output_y < output_height; output_y++) { | 
|  | 279 | for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) { | 
|  | 280 | size_t safe_input_y = output_y * stride_height; | 
|  | 281 | if XNN_UNPREDICTABLE(safe_input_y < adjusted_padding_top) { | 
|  | 282 | safe_input_y += dilation_height; | 
|  | 283 | } | 
|  | 284 | safe_input_y -= adjusted_padding_top; | 
|  | 285 |  | 
|  | 286 | size_t input_y = output_y * stride_height + pooling_y * dilation_height - input_padding_top; | 
|  | 287 | if XNN_UNPREDICTABLE(input_y >= input_height) { | 
|  | 288 | input_y = safe_input_y; | 
|  | 289 | } | 
|  | 290 |  | 
|  | 291 | for (size_t output_x = 0; output_x < output_width; output_x++) { | 
|  | 292 | for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) { | 
|  | 293 | size_t safe_input_x = output_x * stride_width; | 
|  | 294 | if XNN_UNPREDICTABLE(safe_input_x < adjusted_padding_left) { | 
|  | 295 | safe_input_x += dilation_width; | 
|  | 296 | } | 
|  | 297 | safe_input_x -= adjusted_padding_left; | 
|  | 298 |  | 
|  | 299 | size_t input_x = output_x * stride_width + pooling_x * dilation_width - input_padding_left; | 
|  | 300 | if XNN_UNPREDICTABLE(input_x >= input_width) { | 
|  | 301 | input_x = safe_input_x; | 
|  | 302 | } | 
|  | 303 |  | 
|  | 304 | const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y; | 
| Marat Dukhan | bdc8099 | 2020-04-13 01:21:18 -0700 | [diff] [blame] | 305 | indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride); | 
| Marat Dukhan | c58bd34 | 2020-03-19 18:53:05 -0700 | [diff] [blame] | 306 | } | 
|  | 307 | } | 
|  | 308 | } | 
|  | 309 | } | 
|  | 310 | } else { | 
|  | 311 | const size_t input_x_max = input_width - 1; | 
|  | 312 | const size_t input_y_max = input_height - 1; | 
|  | 313 | for (size_t output_y = 0; output_y < output_height; output_y++) { | 
|  | 314 | for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) { | 
|  | 315 | const size_t input_y = min(doz(output_y * stride_height + pooling_y * dilation_height, input_padding_top), input_y_max); | 
|  | 316 | for (size_t output_x = 0; output_x < output_width; output_x++) { | 
|  | 317 | for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) { | 
|  | 318 | const size_t input_x = min(doz(output_x * stride_width + pooling_x * dilation_width, input_padding_left), input_x_max); | 
|  | 319 | const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y; | 
| Marat Dukhan | bdc8099 | 2020-04-13 01:21:18 -0700 | [diff] [blame] | 320 | indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride); | 
| Marat Dukhan | c58bd34 | 2020-03-19 18:53:05 -0700 | [diff] [blame] | 321 | } | 
| XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 322 | } | 
|  | 323 | } | 
|  | 324 | } | 
|  | 325 | } | 
|  | 326 | } | 
|  | 327 |  | 
// Builds the indirection buffer and interpolation weights for bilinear resize with float weights.
//
// For each output pixel, in row-major order, four pointers are appended to indirection_buffer (top-left, top-right,
// bottom-left and bottom-right input pixels) and two floats are appended to packed_weights (the horizontal and the
// vertical fractional position, in that order).
//
// The scale in each dimension is input / output. With align_corners set and an output dimension other than 1, it is
// (input - 1) / (output - 1) instead. When either tensorflow_legacy or align_corners is set, the source coordinate
// is output * scale; otherwise it is (output + 0.5) * scale - 0.5, clamped to [0, input - 1].
//
//   input_pixel_stride  - address step between horizontally adjacent input pixels.
//   input_height/width  - input image size; each must be non-zero and below 2**24.
//   output_height/width - output image size; each must be non-zero and below 2**24.
//   input               - address of the first input pixel.
//   indirection_buffer  - receives 4 pointers per output pixel.
//   packed_weights      - receives 2 floats per output pixel.
void xnn_indirection_init_resize_bilinear2d_hwc_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // 1 when the corner pixels of input and output must coincide in that dimension, 0 otherwise.
  const int32_t width_correction = (int32_t) (align_corners && output_width != 1);
  const int32_t height_correction = (int32_t) (align_corners && output_height != 1);
  const float x_scale =
    (float) ((int32_t) input_width - width_correction) / (float) ((int32_t) output_width - width_correction);
  const float y_scale =
    (float) ((int32_t) input_height - height_correction) / (float) ((int32_t) output_height - height_correction);

  const uintptr_t input_address = (uintptr_t) input;
  const uint32_t last_row = (uint32_t) input_height - 1;
  const uint32_t last_column = (uint32_t) input_width - 1;

  if (!tensorflow_legacy && !align_corners) {
    // Pixel centers sit at half-integer positions; the shifted coordinate is clamped into the image.
    const float y_shift = 0.5f * y_scale - 0.5f;
    const float x_shift = 0.5f * x_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float unclamped_y = (float) (int32_t) output_y * y_scale + y_shift;
      const float input_y = math_min_f32(math_max_f32(unclamped_y, 0.0f), (float) last_row);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, last_row);
      const float y_fraction = input_y - (float) input_y_top;
      const size_t top_row_start = input_y_top * input_width;
      const size_t bottom_row_start = input_y_bottom * input_width;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float unclamped_x = (float) (int32_t) output_x * x_scale + x_shift;
        const float input_x = math_min_f32(math_max_f32(unclamped_x, 0.0f), (float) last_column);
        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, last_column);
        const float x_fraction = input_x - (float) input_x_left;

        *indirection_buffer++ =
          (const void*) (input_address + (top_row_start + input_x_left) * input_pixel_stride);
        *indirection_buffer++ =
          (const void*) (input_address + (top_row_start + input_x_right) * input_pixel_stride);
        *indirection_buffer++ =
          (const void*) (input_address + (bottom_row_start + input_x_left) * input_pixel_stride);
        *indirection_buffer++ =
          (const void*) (input_address + (bottom_row_start + input_x_right) * input_pixel_stride);
        *packed_weights++ = x_fraction;
        *packed_weights++ = y_fraction;
      }
    }
    return;
  }

  // Legacy / align-corners mapping: source coordinate is simply output * scale.
  for (size_t output_y = 0; output_y < output_height; output_y++) {
    const float input_y = (float) (int32_t) output_y * y_scale;
    assert(input_y >= 0.0f);
    assert(input_y < (float) input_height);

    const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
    const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, last_row);
    const float y_fraction = input_y - (float) input_y_top;
    const size_t top_row_start = input_y_top * input_width;
    const size_t bottom_row_start = input_y_bottom * input_width;
    for (size_t output_x = 0; output_x < output_width; output_x++) {
      const float input_x = (float) (int32_t) output_x * x_scale;
      assert(input_x >= 0.0f);
      assert(input_x < (float) input_width);

      const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
      const uint32_t input_x_right = math_min_u32(input_x_left + 1, last_column);
      const float x_fraction = input_x - (float) input_x_left;

      *indirection_buffer++ =
        (const void*) (input_address + (top_row_start + input_x_left) * input_pixel_stride);
      *indirection_buffer++ =
        (const void*) (input_address + (top_row_start + input_x_right) * input_pixel_stride);
      *indirection_buffer++ =
        (const void*) (input_address + (bottom_row_start + input_x_left) * input_pixel_stride);
      *indirection_buffer++ =
        (const void*) (input_address + (bottom_row_start + input_x_right) * input_pixel_stride);
      *packed_weights++ = x_fraction;
      *packed_weights++ = y_fraction;
    }
  }
}
|  | 422 |  | 
// Builds the indirection buffer and fixed-point interpolation weights for bilinear resize.
//
// For each output pixel, in row-major order, four pointers are appended to indirection_buffer (top-left, top-right,
// bottom-left and bottom-right input pixels) and two int16_t values are appended to packed_weights: the horizontal
// and the vertical fractional position, each multiplied by 2**11 and rounded with lrintf().
//
// The scale in each dimension is input / output. With align_corners set and an output dimension other than 1, it is
// (input - 1) / (output - 1) instead. When either tensorflow_legacy or align_corners is set, the source coordinate
// is output * scale; otherwise it is (output + 0.5) * scale - 0.5, clamped to [0, input - 1].
//
//   input_pixel_stride  - address step between horizontally adjacent input pixels.
//   input_height/width  - input image size; each must be non-zero and below 2**24.
//   output_height/width - output image size; each must be non-zero and below 2**24.
//   input               - address of the first input pixel.
//   indirection_buffer  - receives 4 pointers per output pixel.
//   packed_weights      - receives 2 fixed-point weights per output pixel.
void xnn_indirection_init_resize_bilinear2d_hwc_q11(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  int16_t* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // 1 when the corner pixels of input and output must coincide in that dimension, 0 otherwise.
  const int32_t width_correction = (int32_t) (align_corners && output_width != 1);
  const int32_t height_correction = (int32_t) (align_corners && output_height != 1);
  const float x_scale =
    (float) ((int32_t) input_width - width_correction) / (float) ((int32_t) output_width - width_correction);
  const float y_scale =
    (float) ((int32_t) input_height - height_correction) / (float) ((int32_t) output_height - height_correction);

  const uintptr_t input_address = (uintptr_t) input;
  const uint32_t last_row = (uint32_t) input_height - 1;
  const uint32_t last_column = (uint32_t) input_width - 1;

  if (!tensorflow_legacy && !align_corners) {
    // Pixel centers sit at half-integer positions; the shifted coordinate is clamped into the image.
    const float y_shift = 0.5f * y_scale - 0.5f;
    const float x_shift = 0.5f * x_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float unclamped_y = (float) (int32_t) output_y * y_scale + y_shift;
      const float input_y = math_min_f32(math_max_f32(unclamped_y, 0.0f), (float) last_row);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, last_row);
      const float y_fraction = input_y - (float) input_y_top;
      const size_t top_row_start = input_y_top * input_width;
      const size_t bottom_row_start = input_y_bottom * input_width;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float unclamped_x = (float) (int32_t) output_x * x_scale + x_shift;
        const float input_x = math_min_f32(math_max_f32(unclamped_x, 0.0f), (float) last_column);
        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, last_column);
        const float x_fraction = input_x - (float) input_x_left;

        *indirection_buffer++ =
          (const void*) (input_address + (top_row_start + input_x_left) * input_pixel_stride);
        *indirection_buffer++ =
          (const void*) (input_address + (top_row_start + input_x_right) * input_pixel_stride);
        *indirection_buffer++ =
          (const void*) (input_address + (bottom_row_start + input_x_left) * input_pixel_stride);
        *indirection_buffer++ =
          (const void*) (input_address + (bottom_row_start + input_x_right) * input_pixel_stride);
        // Convert the fractions to fixed point with 11 fractional bits.
        *packed_weights++ = (int16_t) lrintf(x_fraction * 0x1.0p+11f);
        *packed_weights++ = (int16_t) lrintf(y_fraction * 0x1.0p+11f);
      }
    }
    return;
  }

  // Legacy / align-corners mapping: source coordinate is simply output * scale.
  for (size_t output_y = 0; output_y < output_height; output_y++) {
    const float input_y = (float) (int32_t) output_y * y_scale;
    assert(input_y >= 0.0f);
    assert(input_y < (float) input_height);

    const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
    const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, last_row);
    const float y_fraction = input_y - (float) input_y_top;
    const size_t top_row_start = input_y_top * input_width;
    const size_t bottom_row_start = input_y_bottom * input_width;
    for (size_t output_x = 0; output_x < output_width; output_x++) {
      const float input_x = (float) (int32_t) output_x * x_scale;
      assert(input_x >= 0.0f);
      assert(input_x < (float) input_width);

      const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
      const uint32_t input_x_right = math_min_u32(input_x_left + 1, last_column);
      const float x_fraction = input_x - (float) input_x_left;

      *indirection_buffer++ =
        (const void*) (input_address + (top_row_start + input_x_left) * input_pixel_stride);
      *indirection_buffer++ =
        (const void*) (input_address + (top_row_start + input_x_right) * input_pixel_stride);
      *indirection_buffer++ =
        (const void*) (input_address + (bottom_row_start + input_x_left) * input_pixel_stride);
      *indirection_buffer++ =
        (const void*) (input_address + (bottom_row_start + input_x_right) * input_pixel_stride);
      // Convert the fractions to fixed point with 11 fractional bits.
      *packed_weights++ = (int16_t) lrintf(x_fraction * 0x1.0p+11f);
      *packed_weights++ = (int16_t) lrintf(y_fraction * 0x1.0p+11f);
    }
  }
}
|  | 517 |  | 
// Builds the indirection buffer and interpolation weights for a bilinear
// resize in CHW layout with float weights.
//
// Output pixels are visited row by row. For each of them this routine writes:
//   - two pointers to indirection_buffer: the address of the left source
//     pixel in the top source row, then the address of the left source pixel
//     in the bottom source row;
//   - two floats to packed_weights: the horizontal weight, then the vertical
//     weight.
// Only the left pixel is recorded; the right-edge adjustment below guarantees
// that the pixel immediately to its right exists.
//
// Parameters:
//   input_pixel_stride  - multiplier applied to a linear pixel index to get
//                         the offset that is added to the address of input.
//   input_height/width  - source extent; each must be > 1 and < 2**24.
//   output_height/width - destination extent; each must be != 0 and < 2**24.
//   input               - base address of the source image.
//   indirection_buffer  - receives 2 * output_height * output_width pointers.
//   packed_weights      - receives 2 * output_height * output_width floats.
//   align_corners       - scale over (size - 1) intervals instead of size.
//   tensorflow_legacy   - together with align_corners selects the mapping
//                         without the half-pixel offset.
void xnn_indirection_init_resize_bilinear2d_chw_f32(
    size_t input_pixel_stride,
    size_t input_height,
    size_t input_width,
    size_t output_height,
    size_t output_width,
    const void* input,
    const void** indirection_buffer,
    float* packed_weights,
    bool align_corners,
    bool tensorflow_legacy)
{
  // NOTE(review): the 2**24 bounds presumably keep every coordinate exactly
  // representable as a float - confirm.
  assert(input_height > 1);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width > 1);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // With align_corners both extents shrink by one before the division. An
  // output extent of exactly 1 is left unadjusted, which keeps the divisor
  // from becoming zero.
  const int32_t width_adjustment = (align_corners && output_width != 1) ? 1 : 0;
  const int32_t height_adjustment = (align_corners && output_height != 1) ? 1 : 0;
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t last_row = (uint32_t) input_height - 1;
  const uint32_t last_column = (uint32_t) input_width - 1;

  // Write cursors; each output pixel consumes two slots from both of them.
  const void** pointer_cursor = indirection_buffer;
  float* weight_cursor = packed_weights;

  if (tensorflow_legacy || align_corners) {
    // Source coordinate is the output coordinate times the scale, no offset.
    for (size_t out_row = 0; out_row < output_height; out_row++) {
      const float source_y = (float) (int32_t) out_row * height_scale;
      assert(source_y >= 0.0f);
      assert(source_y < (float) input_height);

      const uint32_t top_row = (uint32_t) (int32_t) source_y;
      const uint32_t bottom_row = math_min_u32(top_row + 1, last_row);
      const float weight_y = source_y - (float) top_row;

      // Linear index of the first pixel in the top and bottom source rows.
      const size_t top_row_start = top_row * input_width;
      const size_t bottom_row_start = bottom_row * input_width;

      for (size_t out_column = 0; out_column < output_width; out_column++) {
        const float source_x = (float) (int32_t) out_column * width_scale;
        assert(source_x >= 0.0f);
        assert(source_x < (float) input_width);

        uint32_t left_column = (uint32_t) (int32_t) source_x;
        float weight_x = source_x - (float) left_column;
        if (left_column == last_column) {
          // Some CHW kernels need a pixel to the right of the one pointed at:
          // step one pixel back and put the whole weight on the right pixel.
          left_column -= 1;
          weight_x = 1.0f;
        }

        *pointer_cursor++ =
          (void*) ((uintptr_t) input + (top_row_start + left_column) * input_pixel_stride);
        *pointer_cursor++ =
          (void*) ((uintptr_t) input + (bottom_row_start + left_column) * input_pixel_stride);
        *weight_cursor++ = weight_x;
        *weight_cursor++ = weight_y;
      }
    }
  } else {
    // Half-pixel-centered mapping; the result is clamped to the valid range
    // of source coordinates before it is split into index and weight.
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;

    for (size_t out_row = 0; out_row < output_height; out_row++) {
      const float raw_y = (float) (int32_t) out_row * height_scale + height_offset;
      const float source_y = math_min_f32(math_max_f32(raw_y, 0.0f), (float) last_row);

      const uint32_t top_row = (uint32_t) (int32_t) source_y;
      assert((int32_t) top_row >= 0);
      const uint32_t bottom_row = math_min_u32(top_row + 1, last_row);
      const float weight_y = source_y - (float) top_row;

      // Linear index of the first pixel in the top and bottom source rows.
      const size_t top_row_start = top_row * input_width;
      const size_t bottom_row_start = bottom_row * input_width;

      for (size_t out_column = 0; out_column < output_width; out_column++) {
        const float raw_x = (float) (int32_t) out_column * width_scale + width_offset;
        const float source_x = math_min_f32(math_max_f32(raw_x, 0.0f), (float) last_column);

        uint32_t left_column = (uint32_t) (int32_t) source_x;
        assert((int32_t) left_column >= 0);

        float weight_x = source_x - (float) left_column;
        if (left_column == last_column) {
          // Some CHW kernels need a pixel to the right of the one pointed at:
          // step one pixel back and put the whole weight on the right pixel.
          left_column -= 1;
          weight_x = 1.0f;
        }

        *pointer_cursor++ =
          (void*) ((uintptr_t) input + (top_row_start + left_column) * input_pixel_stride);
        *pointer_cursor++ =
          (void*) ((uintptr_t) input + (bottom_row_start + left_column) * input_pixel_stride);
        *weight_cursor++ = weight_x;
        *weight_cursor++ = weight_y;
      }
    }
  }
}
|  | 617 |  | 
| XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 618 | void xnn_indirection_init_unpool2d( | 
|  | 619 | xnn_operator_t op, | 
|  | 620 | size_t batch_start, | 
|  | 621 | uint32_t log2_element_size) | 
|  | 622 | { | 
|  | 623 | const void** indirection_buffer  = op->indirection_buffer; | 
|  | 624 | const void* output               = op->output; | 
|  | 625 | const size_t output_pixel_stride = op->output_pixel_stride << log2_element_size; | 
|  | 626 | const size_t batch_size          = op->batch_size; | 
|  | 627 | const size_t input_height        = op->input_height; | 
|  | 628 | const size_t input_width         = op->input_width; | 
|  | 629 | const size_t output_height       = op->output_height; | 
|  | 630 | const size_t output_width        = op->output_width; | 
|  | 631 | const size_t pooling_height      = op->kernel_height; | 
|  | 632 | const size_t pooling_width       = op->kernel_width; | 
|  | 633 | const size_t output_padding_top  = op->padding_top; | 
|  | 634 | const size_t output_padding_left = op->padding_left; | 
|  | 635 |  | 
|  | 636 | for (size_t image = batch_start; image < batch_size; image++) { | 
|  | 637 | for (size_t input_y = 0; input_y < input_height; input_y++) { | 
|  | 638 | for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) { | 
|  | 639 | const size_t output_y = min(doz(input_y * pooling_height + pooling_y, output_padding_top), output_height - 1); | 
|  | 640 | for (size_t input_x = 0; input_x < input_width; input_x++) { | 
|  | 641 | for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) { | 
|  | 642 | const size_t output_x = min(doz(input_x * pooling_width + pooling_x, output_padding_left), output_width - 1); | 
|  | 643 | indirection_buffer[(((image * input_height + input_y) * input_width + input_x) * pooling_width + pooling_x) * pooling_height + pooling_y] = | 
| Marat Dukhan | bdc8099 | 2020-04-13 01:21:18 -0700 | [diff] [blame] | 644 | (const void*) ((uintptr_t) output + ((image * output_height + output_y) * output_width + output_x) * output_pixel_stride); | 
| XNNPACK Team | b455b12 | 2019-09-27 18:10:33 -0700 | [diff] [blame] | 645 | } | 
|  | 646 | } | 
|  | 647 | } | 
|  | 648 | } | 
|  | 649 | } | 
|  | 650 | } |