S8 & U8 Resize Bilinear NHWC operators

PiperOrigin-RevId: 412158448
diff --git a/src/indirection.c b/src/indirection.c
index 14b8aa9..872d4cc 100644
--- a/src/indirection.c
+++ b/src/indirection.c
@@ -7,6 +7,7 @@
 // LICENSE file in the root directory of this source tree.
 
 #include <stddef.h>
+#include <math.h>
 
 #include <fxdiv.h>
 
@@ -419,6 +420,101 @@
   }
 }
 
+void xnn_indirection_init_resize_bilinear2d_hwc_q11(
+  size_t input_pixel_stride,  // multiplied by the pixel index and added to the input address
+  size_t input_height,
+  size_t input_width,
+  size_t output_height,
+  size_t output_width,
+  const void* input,  // base address that every generated pointer is offset from
+  const void** indirection_buffer,  // out: 4 neighbour pointers per output pixel
+  int16_t* packed_weights,  // out: 2 weights per output pixel, each alpha scaled by 2**11
+  bool align_corners,
+  bool tensorflow_legacy)
+{
+  assert(input_height != 0);
+  assert(input_height < 16777216 /* 2**24 */);
+  assert(input_width != 0);
+  assert(input_width < 16777216 /* 2**24 */);
+  assert(output_height != 0);
+  assert(output_height < 16777216 /* 2**24 */);
+  assert(output_width != 0);
+  assert(output_width < 16777216 /* 2**24 */);
+  // Adjustment is 1 only when align_corners is set and the output has more than one pixel along that axis.
+  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
+  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
+  const float width_scale =
+    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
+  const float height_scale =
+    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);
+  // Largest valid input row/column index; caps the bottom/right neighbour below.
+  const uint32_t input_y_max = (uint32_t) input_height - 1;
+  const uint32_t input_x_max = (uint32_t) input_width - 1;
+  if (tensorflow_legacy || align_corners) {  // no half-pixel offset: input coordinate = output index * scale
+    for (size_t output_y = 0; output_y < output_height; output_y++) {
+      const float input_y = (float) (int32_t) output_y * height_scale;
+      assert(input_y >= 0.0f);
+      assert(input_y < (float) input_height);
+      // Truncation gives the top row; the fractional remainder becomes the vertical weight.
+      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
+      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
+      const float alpha_y = input_y - (float) input_y_top;
+      for (size_t output_x = 0; output_x < output_width; output_x++) {
+        const float input_x = (float) (int32_t) output_x * width_scale;
+        assert(input_x >= 0.0f);
+        assert(input_x < (float) input_width);
+        // Truncation gives the left column; the fractional remainder becomes the horizontal weight.
+        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
+        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
+        const float alpha_x = input_x - (float) input_x_left;
+        indirection_buffer[0] =  // top-left neighbour
+          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
+        indirection_buffer[1] =  // top-right neighbour
+          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
+        indirection_buffer[2] =  // bottom-left neighbour
+          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
+        indirection_buffer[3] =  // bottom-right neighbour
+          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
+        packed_weights[0] = (int16_t) lrintf(alpha_x * 0x1.0p+11f);  // horizontal weight: alpha_x * 2048, rounded by lrintf
+        packed_weights[1] = (int16_t) lrintf(alpha_y * 0x1.0p+11f);  // vertical weight: alpha_y * 2048, rounded by lrintf
+        indirection_buffer += 4;  // advance to the next output pixel's 4 pointers
+        packed_weights += 2;  // advance to the next output pixel's 2 weights
+      }
+    }
+  } else {  // half-pixel offset: input coordinate = output index * scale + (0.5 * scale - 0.5), then clamped
+    const float height_offset = 0.5f * height_scale - 0.5f;
+    const float width_offset = 0.5f * width_scale - 0.5f;
+    for (size_t output_y = 0; output_y < output_height; output_y++) {
+      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
+      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);  // clamp to [0, input_height - 1]
+      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
+      assert((int32_t) input_y_top >= 0);
+      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
+      const float alpha_y = input_y - (float) input_y_top;
+      for (size_t output_x = 0; output_x < output_width; output_x++) {
+        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
+        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);  // clamp to [0, input_width - 1]
+        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
+        assert((int32_t) input_x_left >= 0);
+        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
+        const float alpha_x = input_x - (float) input_x_left;
+        indirection_buffer[0] =  // top-left neighbour
+          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
+        indirection_buffer[1] =  // top-right neighbour
+          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
+        indirection_buffer[2] =  // bottom-left neighbour
+          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
+        indirection_buffer[3] =  // bottom-right neighbour
+          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
+        packed_weights[0] = (int16_t) lrintf(alpha_x * 0x1.0p+11f);  // horizontal weight: alpha_x * 2048, rounded by lrintf
+        packed_weights[1] = (int16_t) lrintf(alpha_y * 0x1.0p+11f);  // vertical weight: alpha_y * 2048, rounded by lrintf
+        indirection_buffer += 4;  // advance to the next output pixel's 4 pointers
+        packed_weights += 2;  // advance to the next output pixel's 2 weights
+      }
+    }
+  }
+}
+
 void xnn_indirection_init_resize_bilinear2d_chw_f32(
   size_t input_pixel_stride,
   size_t input_height,
diff --git a/src/operator-strings.c b/src/operator-strings.c
index 3db439f..5e4cffb 100644
--- a/src/operator-strings.c
+++ b/src/operator-strings.c
@@ -138,6 +138,10 @@
       return "PReLU (NC, F32)";
     case xnn_operator_type_resize_bilinear_nhwc_f32:
       return "Resize Bilinear (NHWC, F32)";
+    case xnn_operator_type_resize_bilinear_nhwc_s8:
+      return "Resize Bilinear (NHWC, S8)";
+    case xnn_operator_type_resize_bilinear_nhwc_u8:
+      return "Resize Bilinear (NHWC, U8)";
     case xnn_operator_type_resize_bilinear_nchw_f32:
       return "Resize Bilinear (NCHW, F32)";
     case xnn_operator_type_sigmoid_nc_f32:
diff --git a/src/operators/resize-bilinear-nhwc.c b/src/operators/resize-bilinear-nhwc.c
index 9a898dd..ac1836d 100644
--- a/src/operators/resize-bilinear-nhwc.c
+++ b/src/operators/resize-bilinear-nhwc.c
@@ -21,11 +21,12 @@
 #include <xnnpack/indirection.h>
 
 
-enum xnn_status xnn_create_resize_bilinear2d_nhwc_f32(
+static enum xnn_status create_resize_bilinear2d_nhwc(
     size_t channels,
     size_t input_pixel_stride,
     size_t output_pixel_stride,
     uint32_t flags,
+    enum xnn_operator_type operator_type,
     xnn_operator_t* resize_op_out)
 {
   xnn_operator_t resize_op = NULL;
@@ -33,7 +34,7 @@
 
   if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
     xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
-      xnn_operator_type_to_string(xnn_operator_type_resize_bilinear_nhwc_f32));
+      xnn_operator_type_to_string(operator_type));
     goto error;
   }
 
@@ -42,7 +43,7 @@
   if (channels == 0) {
     xnn_log_error(
       "failed to create %s operator with %zu channels: number of channels must be non-zero",
-      xnn_operator_type_to_string(xnn_operator_type_resize_bilinear_nhwc_f32), channels);
+      xnn_operator_type_to_string(operator_type), channels);
     goto error;
   }
 
@@ -50,7 +51,7 @@
     xnn_log_error(
       "failed to create %s operator with input pixel stride of %zu: "
       "stride must be at least as large as the number of channels (%zu)",
-      xnn_operator_type_to_string(xnn_operator_type_resize_bilinear_nhwc_f32), input_pixel_stride, channels);
+      xnn_operator_type_to_string(operator_type), input_pixel_stride, channels);
     goto error;
   }
 
@@ -58,7 +59,7 @@
     xnn_log_error(
       "failed to create %s operator with output pixel stride of %zu: "
       "stride must be at least as large as the number of channels (%zu)",
-      xnn_operator_type_to_string(xnn_operator_type_resize_bilinear_nhwc_f32), output_pixel_stride, channels);
+      xnn_operator_type_to_string(operator_type), output_pixel_stride, channels);
     goto error;
   }
 
@@ -68,7 +69,7 @@
   if (resize_op == NULL) {
     xnn_log_error(
       "failed to allocate %zu bytes for %s operator descriptor",
-      sizeof(struct xnn_operator), xnn_operator_type_to_string(xnn_operator_type_resize_bilinear_nhwc_f32));
+      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
     goto error;
   }
 
@@ -76,7 +77,7 @@
   resize_op->input_pixel_stride = input_pixel_stride;
   resize_op->output_pixel_stride = output_pixel_stride;
 
-  resize_op->type = xnn_operator_type_resize_bilinear_nhwc_f32;
+  resize_op->type = operator_type;
   resize_op->flags = flags;
 
   resize_op->state = xnn_run_state_invalid;
@@ -89,20 +90,73 @@
   return status;
 }
 
-enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f32(
+enum xnn_status xnn_create_resize_bilinear2d_nhwc_f32(
+    size_t channels,
+    size_t input_pixel_stride,
+    size_t output_pixel_stride,
+    uint32_t flags,
+    xnn_operator_t* resize_op_out)
+{
+  return create_resize_bilinear2d_nhwc(  // F32 entry point: forwards every argument to the shared creator
+    channels,
+    input_pixel_stride,
+    output_pixel_stride,
+    flags,
+    xnn_operator_type_resize_bilinear_nhwc_f32,  // type stored on the operator and named in its error messages
+    resize_op_out);
+}
+
+enum xnn_status xnn_create_resize_bilinear2d_nhwc_s8(
+    size_t channels,
+    size_t input_pixel_stride,
+    size_t output_pixel_stride,
+    uint32_t flags,
+    xnn_operator_t* resize_op_out)
+{
+  return create_resize_bilinear2d_nhwc(  // S8 entry point: forwards every argument to the shared creator
+    channels,
+    input_pixel_stride,
+    output_pixel_stride,
+    flags,
+    xnn_operator_type_resize_bilinear_nhwc_s8,  // type stored on the operator and named in its error messages
+    resize_op_out);
+}
+
+enum xnn_status xnn_create_resize_bilinear2d_nhwc_u8(
+    size_t channels,
+    size_t input_pixel_stride,
+    size_t output_pixel_stride,
+    uint32_t flags,
+    xnn_operator_t* resize_op_out)
+{
+  return create_resize_bilinear2d_nhwc(  // U8 entry point: forwards every argument to the shared creator
+    channels,
+    input_pixel_stride,
+    output_pixel_stride,
+    flags,
+    xnn_operator_type_resize_bilinear_nhwc_u8,  // type stored on the operator and named in its error messages
+    resize_op_out);
+}
+
+static enum xnn_status setup_resize_bilinear2d_nhwc(
     xnn_operator_t resize_op,
+    enum xnn_operator_type expected_operator_type,
     size_t batch_size,
     size_t input_height,
     size_t input_width,
     size_t output_height,
     size_t output_width,
-    const float* input,
-    float* output,
-    pthreadpool_t threadpool)
+    const void* input,
+    void* output,
+    uint32_t log2_element_size,
+    uint32_t log2_weight_element_size,
+    xnn_indirection_init_resize_bilinear2d_hwc_fn indirection_init,
+    const struct ibilinear_parameters ibilinear[restrict XNN_MIN_ELEMENTS(1)],
+    size_t num_threads)
 {
-  if (resize_op->type != xnn_operator_type_resize_bilinear_nhwc_f32) {
+  if (resize_op->type != expected_operator_type) {
     xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
-      xnn_operator_type_to_string(xnn_operator_type_resize_bilinear_nhwc_f32),
+      xnn_operator_type_to_string(expected_operator_type),
       xnn_operator_type_to_string(resize_op->type));
     return xnn_status_invalid_parameter;
   }
@@ -110,35 +164,35 @@
 
   if ((xnn_params.init_flags & XNN_INIT_FLAG_XNNPACK) == 0) {
     xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
-      xnn_operator_type_to_string(xnn_operator_type_resize_bilinear_nhwc_f32));
+      xnn_operator_type_to_string(resize_op->type));
     return xnn_status_uninitialized;
   }
 
   if (input_width == 0 || input_height == 0) {
     xnn_log_error(
       "failed to setup %s operator with %zux%zu input: input dimensions must be non-zero",
-      xnn_operator_type_to_string(xnn_operator_type_resize_bilinear_nhwc_f32), input_width, input_height);
+      xnn_operator_type_to_string(resize_op->type), input_width, input_height);
     return xnn_status_invalid_parameter;
   }
 
   if (max(input_width, input_height) >= 16777216) {
     xnn_log_error(
       "failed to setup %s operator with %zux%zu input: input dimensions must be below 2**24",
-      xnn_operator_type_to_string(xnn_operator_type_resize_bilinear_nhwc_f32), input_width, input_height);
+      xnn_operator_type_to_string(resize_op->type), input_width, input_height);
     return xnn_status_unsupported_parameter;
   }
 
   if (output_width == 0 || output_height == 0) {
     xnn_log_error(
       "failed to setup %s operator with %zux%zu output: output dimensions must be non-zero",
-      xnn_operator_type_to_string(xnn_operator_type_resize_bilinear_nhwc_f32), output_width, output_height);
+      xnn_operator_type_to_string(resize_op->type), output_width, output_height);
     return xnn_status_invalid_parameter;
   }
 
   if (max(output_width, output_height) >= 16777216) {
     xnn_log_error(
       "failed to setup %s operator with %zux%zu output: output dimensions must be below 2**24",
-      xnn_operator_type_to_string(xnn_operator_type_resize_bilinear_nhwc_f32), output_width, output_height);
+      xnn_operator_type_to_string(resize_op->type), output_width, output_height);
     return xnn_status_unsupported_parameter;
   }
 
@@ -149,13 +203,13 @@
 
   if (output_height * output_width != resize_op->last_output_height * resize_op->last_output_width) {
     const size_t indirection_buffer_size = sizeof(void*) * (output_height * output_width * 4);
-    const size_t packed_weights_size = sizeof(float) * (output_height * output_width * 2);
+    const size_t packed_weights_size = (output_height * output_width * 2) << log2_weight_element_size;
 
     const void** indirection_buffer = (const void**) xnn_reallocate_memory(resize_op->indirection_buffer, indirection_buffer_size);
     if (indirection_buffer == NULL) {
       xnn_log_error(
         "failed to allocate %zu bytes for %s operator indirection buffer",
-        indirection_buffer_size, xnn_operator_type_to_string(xnn_operator_type_resize_bilinear_nhwc_f32));
+        indirection_buffer_size, xnn_operator_type_to_string(resize_op->type));
       return xnn_status_out_of_memory;
     }
     resize_op->indirection_buffer = indirection_buffer;
@@ -166,19 +220,19 @@
     if (resize_op->packed_weights == NULL) {
       xnn_log_error(
         "failed to allocate %zu bytes for %s operator packed weights",
-        packed_weights_size, xnn_operator_type_to_string(xnn_operator_type_resize_bilinear_nhwc_f32));
+        packed_weights_size, xnn_operator_type_to_string(resize_op->type));
       return xnn_status_out_of_memory;
     }
   }
 
-  const size_t input_pixel_stride_in_bytes = resize_op->input_pixel_stride * sizeof(float);
+  const size_t input_pixel_stride_in_bytes = resize_op->input_pixel_stride << log2_element_size;
   if (input_height != resize_op->last_input_height ||
       input_width != resize_op->last_input_width ||
       output_height != resize_op->last_output_height ||
       output_width != resize_op->last_output_width)
   {
     const uint32_t flags = resize_op->flags;
-    xnn_indirection_init_resize_bilinear2d_hwc_f32(
+    indirection_init(
       input_pixel_stride_in_bytes,
       input_height, input_width,
       output_height, output_width,
@@ -193,9 +247,9 @@
     resize_op->last_output_width = output_width;
   }
 
-  const size_t output_pixel_stride_in_bytes = resize_op->output_pixel_stride * sizeof(float);
+  const size_t output_pixel_stride_in_bytes = resize_op->output_pixel_stride << log2_element_size;
   resize_op->context.resize_bilinear = (struct resize_bilinear_context) {
-    .scaled_channels = resize_op->channels * sizeof(float),
+    .scaled_channels = resize_op->channels << log2_element_size,
     .indirect_input = resize_op->indirection_buffer,
     .input_offset = (size_t) ((uintptr_t) input - (uintptr_t) resize_op->last_input),
     .input_batch_stride = input_pixel_stride_in_bytes * input_height * input_width,
@@ -203,18 +257,17 @@
     .output = output,
     .output_pixel_stride = output_pixel_stride_in_bytes,
     .output_batch_stride = output_pixel_stride_in_bytes * output_height * output_width,
-    .log2_wsize = 3 /* log2(2 * sizeof(float)) */,
-    .ukernel = xnn_params.f32.ibilinear.ukernel,
+    .log2_wsize = 1 + log2_weight_element_size /* log2(2 * sizeof(weight)) */,
+    .ukernel = ibilinear->ukernel,
   };
 
   const size_t output_size = output_height * output_width;
   size_t output_size_tile = output_size;
-  const size_t num_threads = pthreadpool_get_threads_count(threadpool);
   if (num_threads > 1) {
     const size_t target_tiles_per_thread = 5;
     const size_t max_output_size_tile = divide_round_up(output_size, num_threads * target_tiles_per_thread);
     if (max_output_size_tile < output_size_tile) {
-      const uint32_t output_size_subtile = xnn_params.f32.ibilinear.pixel_tile;
+      const uint32_t output_size_subtile = ibilinear->pixel_tile;
       output_size_tile =
         min(output_size_tile,
           divide_round_up(output_size_tile, max_output_size_tile * output_size_subtile) * output_size_subtile);
@@ -229,3 +282,87 @@
 
   return xnn_status_success;
 }
+
+enum xnn_status xnn_setup_resize_bilinear2d_nhwc_f32(
+    xnn_operator_t resize_op,
+    size_t batch_size,
+    size_t input_height,
+    size_t input_width,
+    size_t output_height,
+    size_t output_width,
+    const float* input,
+    float* output,
+    pthreadpool_t threadpool)
+{
+  return setup_resize_bilinear2d_nhwc(  // F32 entry point: binds the float-specific sizes, initializer and kernel
+    resize_op,
+    xnn_operator_type_resize_bilinear_nhwc_f32,  // setup returns invalid_parameter if resize_op->type differs
+    batch_size,
+    input_height,
+    input_width,
+    output_height,
+    output_width,
+    input,
+    output,
+    2 /* log2(element size) == log2(sizeof(float)) */,
+    2 /* log2(weight element size) == log2(sizeof(float)) */,
+    (xnn_indirection_init_resize_bilinear2d_hwc_fn) xnn_indirection_init_resize_bilinear2d_hwc_f32,
+    &xnn_params.f32.ibilinear,  // provides the micro-kernel and pixel tile used by the shared setup
+    pthreadpool_get_threads_count(threadpool));  // only the thread count is forwarded, not the pool itself
+}
+
+enum xnn_status xnn_setup_resize_bilinear2d_nhwc_s8(
+    xnn_operator_t resize_op,
+    size_t batch_size,
+    size_t input_height,
+    size_t input_width,
+    size_t output_height,
+    size_t output_width,
+    const int8_t* input,
+    int8_t* output,
+    pthreadpool_t threadpool)
+{
+  return setup_resize_bilinear2d_nhwc(  // S8 entry point: binds the int8-specific sizes, initializer and kernel
+    resize_op,
+    xnn_operator_type_resize_bilinear_nhwc_s8,  // setup returns invalid_parameter if resize_op->type differs
+    batch_size,
+    input_height,
+    input_width,
+    output_height,
+    output_width,
+    input,
+    output,
+    0 /* log2(element size) == log2(sizeof(int8_t)) */,
+    1 /* log2(weight element size) == log2(sizeof(int16_t)) */,
+    (xnn_indirection_init_resize_bilinear2d_hwc_fn) xnn_indirection_init_resize_bilinear2d_hwc_q11,  // NOTE(review): cast retypes packed_weights from int16_t* to void* - confirm
+    &xnn_params.s8.ibilinear,  // provides the micro-kernel and pixel tile used by the shared setup
+    pthreadpool_get_threads_count(threadpool));  // only the thread count is forwarded, not the pool itself
+}
+
+enum xnn_status xnn_setup_resize_bilinear2d_nhwc_u8(
+    xnn_operator_t resize_op,
+    size_t batch_size,
+    size_t input_height,
+    size_t input_width,
+    size_t output_height,
+    size_t output_width,
+    const uint8_t* input,
+    uint8_t* output,
+    pthreadpool_t threadpool)
+{
+  return setup_resize_bilinear2d_nhwc(  // U8 entry point: binds the uint8-specific sizes, initializer and kernel
+    resize_op,
+    xnn_operator_type_resize_bilinear_nhwc_u8,  // setup returns invalid_parameter if resize_op->type differs
+    batch_size,
+    input_height,
+    input_width,
+    output_height,
+    output_width,
+    input,
+    output,
+    0 /* log2(element size) == log2(sizeof(uint8_t)) */,
+    1 /* log2(weight element size) == log2(sizeof(int16_t)) */,
+    (xnn_indirection_init_resize_bilinear2d_hwc_fn) xnn_indirection_init_resize_bilinear2d_hwc_q11,  // NOTE(review): cast retypes packed_weights from int16_t* to void* - confirm
+    &xnn_params.u8.ibilinear,  // provides the micro-kernel and pixel tile used by the shared setup
+    pthreadpool_get_threads_count(threadpool));  // only the thread count is forwarded, not the pool itself
+}
diff --git a/src/xnnpack/indirection.h b/src/xnnpack/indirection.h
index a342df2..67bda52 100644
--- a/src/xnnpack/indirection.h
+++ b/src/xnnpack/indirection.h
@@ -58,6 +58,30 @@
   bool align_corners,
   bool tensorflow_legacy);
 
+typedef void (*xnn_indirection_init_resize_bilinear2d_hwc_fn)(  // common pointer type for HWC bilinear indirection initializers
+  size_t input_pixel_stride,
+  size_t input_height,
+  size_t input_width,
+  size_t output_height,
+  size_t output_width,
+  const void* input,
+  const void** indirection_buffer,
+  void* packed_weights,  // untyped here; the q11 initializer declares this parameter as int16_t*
+  bool align_corners,
+  bool tensorflow_legacy);
+
+XNN_INTERNAL void xnn_indirection_init_resize_bilinear2d_hwc_q11(  // HWC bilinear initializer producing int16_t weights
+  size_t input_pixel_stride,
+  size_t input_height,
+  size_t input_width,
+  size_t output_height,
+  size_t output_width,
+  const void* input,
+  const void** indirection_buffer,  // out: pointer table written by the initializer
+  int16_t* packed_weights,  // out: int16_t weights written by the initializer
+  bool align_corners,
+  bool tensorflow_legacy);
+
 XNN_INTERNAL void xnn_indirection_init_resize_bilinear2d_chw_f32(
   size_t input_pixel_stride,
   size_t input_height,
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
index bf23dfd..7c0a4a0 100644
--- a/src/xnnpack/operator.h
+++ b/src/xnnpack/operator.h
@@ -93,6 +93,8 @@
   xnn_operator_type_prelu_nc_f32,
   xnn_operator_type_resize_bilinear_nchw_f32,
   xnn_operator_type_resize_bilinear_nhwc_f32,
+  xnn_operator_type_resize_bilinear_nhwc_s8,
+  xnn_operator_type_resize_bilinear_nhwc_u8,
   xnn_operator_type_sigmoid_nc_f32,
   xnn_operator_type_sigmoid_nc_qs8,
   xnn_operator_type_sigmoid_nc_qu8,