Unify implementation of unary elementwise operators (Clamp, HardSwish, Sigmoid)

PiperOrigin-RevId: 314793689
diff --git a/BUILD.bazel b/BUILD.bazel
index 54c0636..5343e32 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -56,7 +56,6 @@
     "src/operators/binary-elementwise-nd.c",
     "src/operators/channel-pad-nc.c",
     "src/operators/channel-shuffle-nc.c",
-    "src/operators/clamp-nc.c",
     "src/operators/constant-pad-nd.c",
     "src/operators/convolution-nchw.c",
     "src/operators/convolution-nhwc.c",
@@ -64,13 +63,13 @@
     "src/operators/fully-connected-nc.c",
     "src/operators/global-average-pooling-ncw.c",
     "src/operators/global-average-pooling-nwc.c",
-    "src/operators/hardswish-nc.c",
     "src/operators/leaky-relu-nc.c",
     "src/operators/max-pooling-nhwc.c",
     "src/operators/prelu-nc.c",
     "src/operators/resize-bilinear-nhwc.c",
     "src/operators/sigmoid-nc.c",
     "src/operators/softmax-nc.c",
+    "src/operators/unary-elementwise-nc.c",
     "src/operators/unpooling-nhwc.c",
 ]
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5c623e4..de63c9a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -164,7 +164,6 @@
   src/operators/binary-elementwise-nd.c
   src/operators/channel-pad-nc.c
   src/operators/channel-shuffle-nc.c
-  src/operators/clamp-nc.c
   src/operators/constant-pad-nd.c
   src/operators/convolution-nchw.c
   src/operators/convolution-nhwc.c
@@ -172,13 +171,13 @@
   src/operators/fully-connected-nc.c
   src/operators/global-average-pooling-ncw.c
   src/operators/global-average-pooling-nwc.c
-  src/operators/hardswish-nc.c
   src/operators/leaky-relu-nc.c
   src/operators/max-pooling-nhwc.c
   src/operators/prelu-nc.c
   src/operators/resize-bilinear-nhwc.c
   src/operators/sigmoid-nc.c
   src/operators/softmax-nc.c
+  src/operators/unary-elementwise-nc.c
   src/operators/unpooling-nhwc.c)
 
 SET(XNNPACK_SUBGRAPH_SRCS
diff --git a/src/operators/add-nc.c b/src/operators/add-nc.c
index dcbf36e..916e62e 100644
--- a/src/operators/add-nc.c
+++ b/src/operators/add-nc.c
@@ -138,7 +138,7 @@
   add_op->input_pixel_stride = a_stride;
   add_op->input2_pixel_stride = b_stride;
   add_op->output_pixel_stride = sum_stride;
-  add_op->q8_add_params =
+  add_op->params.q8_add =
     xnn_init_q8_add_params(
       a_zero_point, b_zero_point, sum_zero_point,
       a_scale / sum_scale, b_scale / sum_scale,
@@ -244,7 +244,7 @@
   add_op->input_pixel_stride = a_stride;
   add_op->input2_pixel_stride = b_stride;
   add_op->output_pixel_stride = sum_stride;
-  add_op->f32_minmax_params = xnn_init_f32_minmax_params(sum_min, sum_max);
+  add_op->params.f32_minmax = xnn_init_f32_minmax_params(sum_min, sum_max);
 
   add_op->type = xnn_operator_type_add_nc_f32;
   add_op->ukernel.type = xnn_ukernel_type_add;
@@ -296,7 +296,7 @@
       .a = a,
       .b = b,
       .y = sum,
-      .params.q8 = add_op->q8_add_params,
+      .params.q8 = add_op->params.q8_add,
       .ukernel = xnn_params.q8.vadd,
     };
     add_op->compute.type = xnn_parallelization_type_1d_tile_1d;
@@ -312,7 +312,7 @@
       .y = sum,
       .y_stride = sum_stride * sizeof(uint8_t),
       .n = channels,
-      .params.q8 = add_op->q8_add_params,
+      .params.q8 = add_op->params.q8_add,
       .ukernel = xnn_params.q8.vadd,
     };
     add_op->compute.type = xnn_parallelization_type_1d_tile_1d;
@@ -362,7 +362,7 @@
       .a = a,
       .b = b,
       .y = sum,
-      .params.f32 = add_op->f32_minmax_params,
+      .params.f32 = add_op->params.f32_minmax,
       .ukernel = xnn_params.f32.vadd.op_ukernel,
     };
     add_op->compute.type = xnn_parallelization_type_1d_tile_1d;
@@ -378,7 +378,7 @@
       .y = sum,
       .y_stride = sum_stride * sizeof(float),
       .n = channels * sizeof(float),
-      .params.f32 = add_op->f32_minmax_params,
+      .params.f32 = add_op->params.f32_minmax,
       .ukernel = xnn_params.f32.vadd.op_ukernel,
     };
     add_op->compute.type = xnn_parallelization_type_1d_tile_1d;
diff --git a/src/operators/argmax-pooling-nhwc.c b/src/operators/argmax-pooling-nhwc.c
index 907f39d..b7aa3f3 100644
--- a/src/operators/argmax-pooling-nhwc.c
+++ b/src/operators/argmax-pooling-nhwc.c
@@ -163,7 +163,7 @@
   argmax_pooling_op->input_pixel_stride = input_pixel_stride;
   argmax_pooling_op->output_pixel_stride = output_pixel_stride;
 
-  argmax_pooling_op->f32_minmax_params = xnn_init_f32_minmax_params(output_min, output_max);
+  argmax_pooling_op->params.f32_minmax = xnn_init_f32_minmax_params(output_min, output_max);
 
   argmax_pooling_op->type = xnn_operator_type_argmax_pooling_nhwc_f32;
   argmax_pooling_op->ukernel.type = xnn_ukernel_type_argmax_pooling;
@@ -299,7 +299,7 @@
     .channels = channels,
     .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
     .output_increment = output_width_stride - channels * sizeof(float),
-    .params.f32 = argmax_pooling_op->f32_minmax_params,
+    .params.f32 = argmax_pooling_op->params.f32_minmax,
   };
   argmax_pooling_op->compute.type = xnn_parallelization_type_2d;
   argmax_pooling_op->compute.range[0] = batch_size;
diff --git a/src/operators/average-pooling-nhwc.c b/src/operators/average-pooling-nhwc.c
index d03ba22..9be0c06 100644
--- a/src/operators/average-pooling-nhwc.c
+++ b/src/operators/average-pooling-nhwc.c
@@ -218,7 +218,7 @@
   // Number of rows read in the AVGPOOL micro-kernel.
   const size_t avgpool_nrows =
     round_up(doz(pooling_size, xnn_params.q8.avgpool.mr), xnn_params.q8.avgpool.qr) + xnn_params.q8.avgpool.mr;
-  average_pooling_op->q8_avgpool_params =
+  average_pooling_op->params.q8_avgpool =
     xnn_init_q8_avgpool_params(
       (int32_t) -((uint32_t) input_zero_point * (uint32_t) avgpool_nrows),
       input_scale / (output_scale * (float) pooling_size),
@@ -379,11 +379,11 @@
   average_pooling_op->output_pixel_stride = output_pixel_stride;
 
   average_pooling_op->type = xnn_operator_type_average_pooling_nhwc_f32;
-  average_pooling_op->f32_scaleminmax_params =
+  average_pooling_op->params.f32_scaleminmax =
     xnn_init_f32_scaleminmax_params(1.0f / (float) pooling_size, output_min, output_max);
   const bool tf_same_padding = (flags & XNN_FLAG_TENSORFLOW_SAME_PADDING) != 0;
   if (any_padding || tf_same_padding) {
-    average_pooling_op->f32_minmax_params =
+    average_pooling_op->params.f32_minmax =
       xnn_init_f32_minmax_params(output_min, output_max);
     average_pooling_op->ukernel.type = xnn_ukernel_type_pixelwise_average_pooling;
   } else {
@@ -620,7 +620,7 @@
         .zero = average_pooling_op->zero_buffer,
         .input_increment = (pooling_height * step_width - multipass_adjustment) * sizeof(void*),
         .output_increment = output_width_stride - (channels << log2_output_element_size),
-        .params.f32 = average_pooling_op->f32_scaleminmax_params,
+        .params.f32 = average_pooling_op->params.f32_scaleminmax,
       };
       memcpy(&average_pooling_op->context.average_pooling.params, params, params_size);
       if (pooling_size <= mr) {
@@ -662,7 +662,7 @@
   const size_t input_size = input_height * input_width;
   const size_t pooling_size = average_pooling_op->kernel_height * average_pooling_op->kernel_width;
   const size_t gavgpool_nrows = round_up(input_size, xnn_params.q8.gavgpool.mr);
-  average_pooling_op->q8_gavgpool_params =
+  average_pooling_op->params.q8_gavgpool =
     xnn_init_q8_avgpool_params(
       (int32_t) -((uint32_t) average_pooling_op->input_zero_point * (uint32_t) gavgpool_nrows),
       average_pooling_op->input_scale / (average_pooling_op->output_scale * (float) pooling_size),
@@ -679,10 +679,10 @@
     &xnn_params.q8.avgpool,
     NULL /* no PAVGPOOL micro-kernel */,
     &xnn_params.q8.gavgpool,
-    &average_pooling_op->q8_avgpool_params,
-    sizeof(average_pooling_op->q8_avgpool_params),
-    &average_pooling_op->q8_gavgpool_params,
-    sizeof(average_pooling_op->q8_gavgpool_params),
+    &average_pooling_op->params.q8_avgpool,
+    sizeof(average_pooling_op->params.q8_avgpool),
+    &average_pooling_op->params.q8_gavgpool,
+    sizeof(average_pooling_op->params.q8_gavgpool),
     pthreadpool_get_threads_count(threadpool),
     false /* pixelwise not supported */);
 }
@@ -709,7 +709,7 @@
   const bool is_pixelwise = average_pooling_op->ukernel.type == xnn_ukernel_type_pixelwise_average_pooling;
   if (is_pixelwise) {
     const size_t input_size = input_height * input_width;
-    xnn_update_f32_scaleminmax_params(&average_pooling_op->f32_scaleminmax_params, 1.0f / (float) input_size);
+    xnn_update_f32_scaleminmax_params(&average_pooling_op->params.f32_scaleminmax, 1.0f / (float) input_size);
   }
 
   return setup_average_pooling2d(
@@ -721,10 +721,10 @@
     &xnn_params.f32.avgpool,
     &xnn_params.f32.pavgpool,
     &xnn_params.f32.gavgpool,
-    is_pixelwise ? (const void*) &average_pooling_op->f32_minmax_params : (const void*) &average_pooling_op->f32_scaleminmax_params,
-    is_pixelwise ? sizeof(average_pooling_op->f32_minmax_params) : sizeof(average_pooling_op->f32_scaleminmax_params),
-    &average_pooling_op->f32_scaleminmax_params,
-    sizeof(average_pooling_op->f32_scaleminmax_params),
+    is_pixelwise ? (const void*) &average_pooling_op->params.f32_minmax : (const void*) &average_pooling_op->params.f32_scaleminmax,
+    is_pixelwise ? sizeof(average_pooling_op->params.f32_minmax) : sizeof(average_pooling_op->params.f32_scaleminmax),
+    &average_pooling_op->params.f32_scaleminmax,
+    sizeof(average_pooling_op->params.f32_scaleminmax),
     pthreadpool_get_threads_count(threadpool),
     is_pixelwise);
 }
diff --git a/src/operators/binary-elementwise-nd.c b/src/operators/binary-elementwise-nd.c
index cb9ee1e..8d1927e 100644
--- a/src/operators/binary-elementwise-nd.c
+++ b/src/operators/binary-elementwise-nd.c
@@ -66,7 +66,7 @@
     goto error;
   }
 
-  binary_elementwise_op->f32_minmax_params = xnn_init_f32_minmax_params(output_min, output_max);
+  binary_elementwise_op->params.f32_minmax = xnn_init_f32_minmax_params(output_min, output_max);
 
   binary_elementwise_op->type = operator_type;
   binary_elementwise_op->ukernel.type = xnn_ukernel_type_binary_elementwise;
@@ -274,7 +274,7 @@
     .b = input2,
     .y = output,
     .elements = compressed_output_shape[0] * sizeof(float),
-    .params.f32 = binary_elementwise_op->f32_minmax_params,
+    .params.f32 = binary_elementwise_op->params.f32_minmax,
   };
   const size_t* compressed_a_shape = compressed_input1_shape;
   const size_t* compressed_b_shape = compressed_input2_shape;
diff --git a/src/operators/clamp-nc.c b/src/operators/clamp-nc.c
deleted file mode 100644
index f3243a5..0000000
--- a/src/operators/clamp-nc.c
+++ /dev/null
@@ -1,296 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <math.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-
-#include <xnnpack.h>
-#include <xnnpack/allocator.h>
-#include <xnnpack/log.h>
-#include <xnnpack/operator.h>
-#include <xnnpack/params-init.h>
-#include <xnnpack/params.h>
-
-
-enum xnn_status xnn_create_clamp_nc_u8(
-    size_t channels,
-    size_t input_stride,
-    size_t output_stride,
-    uint8_t output_min,
-    uint8_t output_max,
-    uint32_t flags,
-    xnn_operator_t* clamp_op_out)
-{
-  xnn_operator_t clamp_op = NULL;
-  enum xnn_status status = xnn_status_uninitialized;
-
-  if (!xnn_params.initialized) {
-    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_u8));
-    goto error;
-  }
-
-  status = xnn_status_invalid_parameter;
-
-  if (channels == 0) {
-    xnn_log_error(
-      "failed to create %s operator with %zu channels: number of channels must be non-zero",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_u8), channels);
-    goto error;
-  }
-
-  if (input_stride < channels) {
-    xnn_log_error(
-      "failed to create %s operator with input element stride of %zu: "
-      "stride must be at least as large as the number of channels (%zu)",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_u8), input_stride, channels);
-    goto error;
-  }
-
-  if (output_stride < channels) {
-    xnn_log_error(
-      "failed to create %s operator with output element stride of %zu: "
-      "stride must be at least as large as the number of channels (%zu)",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_u8), output_stride, channels);
-    goto error;
-  }
-
-  if (output_min >= output_max) {
-    xnn_log_error(
-      "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_u8), output_min, output_max);
-    goto error;
-  }
-
-  status = xnn_status_out_of_memory;
-
-  clamp_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
-  if (clamp_op == NULL) {
-    xnn_log_error(
-      "failed to allocate %zu bytes for %s operator descriptor",
-      sizeof(struct xnn_operator), xnn_operator_type_to_string(xnn_operator_type_clamp_nc_u8));
-    goto error;
-  }
-
-  clamp_op->channels = channels;
-  clamp_op->input_pixel_stride = input_stride;
-  clamp_op->output_pixel_stride = output_stride;
-  clamp_op->u8_minmax_params = xnn_init_u8_minmax_params(output_min, output_max);
-
-  clamp_op->type = xnn_operator_type_clamp_nc_u8;
-  clamp_op->ukernel.type = xnn_ukernel_type_clamp;
-
-  clamp_op->state = xnn_run_state_invalid;
-
-  *clamp_op_out = clamp_op;
-  return xnn_status_success;
-
-error:
-  xnn_delete_operator(clamp_op);
-  return status;
-}
-
-enum xnn_status xnn_create_clamp_nc_f32(
-    size_t channels,
-    size_t input_stride,
-    size_t output_stride,
-    float output_min,
-    float output_max,
-    uint32_t flags,
-    xnn_operator_t* clamp_op_out)
-{
-  xnn_operator_t clamp_op = NULL;
-  enum xnn_status status = xnn_status_uninitialized;
-
-  if (!xnn_params.initialized) {
-    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32));
-    goto error;
-  }
-
-  status = xnn_status_invalid_parameter;
-
-  if (channels == 0) {
-    xnn_log_error(
-      "failed to create %s operator with %zu channels: number of channels must be non-zero",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32), channels);
-    goto error;
-  }
-
-  if (input_stride < channels) {
-    xnn_log_error(
-      "failed to create %s operator with input element stride of %zu: "
-      "stride must be at least as large as the number of channels (%zu)",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32), input_stride, channels);
-    goto error;
-  }
-
-  if (output_stride < channels) {
-    xnn_log_error(
-      "failed to create %s operator with output element stride of %zu: "
-      "stride must be at least as large as the number of channels (%zu)",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32), output_stride, channels);
-    goto error;
-  }
-
-  if (isnan(output_min)) {
-    xnn_log_error(
-      "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32));
-    goto error;
-  }
-
-  if (isnan(output_max)) {
-    xnn_log_error(
-      "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32));
-    goto error;
-  }
-
-  if (output_min >= output_max) {
-    xnn_log_error(
-      "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32), output_min, output_max);
-    goto error;
-  }
-
-  status = xnn_status_out_of_memory;
-
-  clamp_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
-  if (clamp_op == NULL) {
-    xnn_log_error(
-      "failed to allocate %zu bytes for %s operator descriptor",
-      sizeof(struct xnn_operator), xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32));
-    goto error;
-  }
-
-  clamp_op->channels = channels;
-  clamp_op->input_pixel_stride = input_stride;
-  clamp_op->output_pixel_stride = output_stride;
-  clamp_op->f32_minmax_params = xnn_init_f32_minmax_params(output_min, output_max);
-
-  clamp_op->type = xnn_operator_type_clamp_nc_f32;
-  clamp_op->ukernel.type = xnn_ukernel_type_clamp;
-
-  clamp_op->state = xnn_run_state_invalid;
-
-  *clamp_op_out = clamp_op;
-  return xnn_status_success;
-
-error:
-  xnn_delete_operator(clamp_op);
-  return status;
-}
-
-static enum xnn_status setup_clamp(
-    xnn_operator_t clamp_op,
-    size_t batch_size,
-    const void* input,
-    void* output,
-    xnn_univector_ukernel_function ukernel,
-    uint32_t log2_element_size,
-    const void* params,
-    size_t params_size)
-{
-  if (!xnn_params.initialized) {
-    xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
-      xnn_operator_type_to_string(clamp_op->type));
-    return xnn_status_uninitialized;
-  }
-
-  if (batch_size == 0) {
-    clamp_op->state = xnn_run_state_skip;
-    return xnn_status_success;
-  }
-
-  const size_t channels = clamp_op->channels;
-  const size_t input_stride = clamp_op->input_pixel_stride;
-  const size_t output_stride = clamp_op->output_pixel_stride;
-  if ((((input_stride ^ channels) | (output_stride ^ channels)) == 0) || batch_size == 1) {
-    const size_t block_size = 4096;
-    clamp_op->context.univector_contiguous = (struct univector_contiguous_context) {
-      .x = input,
-      .x_stride = input_stride << log2_element_size,
-      .y = output,
-      .y_stride = output_stride << log2_element_size,
-      .ukernel = ukernel,
-    };
-    memcpy(&clamp_op->context.univector_contiguous.params, params, params_size);
-    clamp_op->compute.type = xnn_parallelization_type_1d_tile_1d;
-    clamp_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_univector_contiguous;
-    clamp_op->compute.range[0] = (batch_size * channels) << log2_element_size;
-    clamp_op->compute.tile[0] = block_size;
-  } else {
-    clamp_op->context.univector_strided = (struct univector_strided_context) {
-      .n = channels << log2_element_size,
-      .x = input,
-      .x_stride = input_stride << log2_element_size,
-      .y = output,
-      .y_stride = output_stride << log2_element_size,
-      .ukernel = ukernel,
-    };
-    memcpy(&clamp_op->context.univector_strided.params, params, params_size);
-    clamp_op->compute.type = xnn_parallelization_type_1d_tile_1d;
-    clamp_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_univector_strided;
-    clamp_op->compute.range[0] = batch_size;
-    clamp_op->compute.tile[0] = 1;
-  }
-  clamp_op->state = xnn_run_state_ready;
-
-  return xnn_status_success;
-}
-
-enum xnn_status xnn_setup_clamp_nc_u8(
-    xnn_operator_t clamp_op,
-    size_t batch_size,
-    const uint8_t* input,
-    uint8_t* output,
-    pthreadpool_t threadpool)
-{
-  if (clamp_op->type != xnn_operator_type_clamp_nc_u8) {
-    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_u8),
-      xnn_operator_type_to_string(clamp_op->type));
-    return xnn_status_invalid_parameter;
-  }
-  clamp_op->state = xnn_run_state_invalid;
-
-  return setup_clamp(
-    clamp_op,
-    batch_size, input, output,
-    xnn_params.u8.clamp,
-    0 /* log2(sizeof(uint8_t)) */,
-    &clamp_op->u8_minmax_params,
-    sizeof(clamp_op->u8_minmax_params));
-}
-
-enum xnn_status xnn_setup_clamp_nc_f32(
-    xnn_operator_t clamp_op,
-    size_t batch_size,
-    const float* input,
-    float* output,
-    pthreadpool_t threadpool)
-{
-  if (clamp_op->type != xnn_operator_type_clamp_nc_f32) {
-    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
-      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32),
-      xnn_operator_type_to_string(clamp_op->type));
-    return xnn_status_invalid_parameter;
-  }
-  clamp_op->state = xnn_run_state_invalid;
-
-  return setup_clamp(
-    clamp_op,
-    batch_size, input, output,
-    xnn_params.f32.clamp,
-    2 /* log2(sizeof(float)) */,
-    &clamp_op->f32_minmax_params,
-    sizeof(clamp_op->f32_minmax_params));
-}
diff --git a/src/operators/convolution-nchw.c b/src/operators/convolution-nchw.c
index cffd251..765d40a 100644
--- a/src/operators/convolution-nchw.c
+++ b/src/operators/convolution-nchw.c
@@ -458,9 +458,9 @@
   convolution_op->group_output_channels = group_output_channels;
 
   if (ukernel_type == xnn_ukernel_type_dwconv) {
-    convolution_op->f32_chw_params = xnn_init_f32_chw_params(0, output_min, output_max);
+    convolution_op->params.f32_chw = xnn_init_f32_chw_params(0, output_min, output_max);
   } else {
-    convolution_op->f32_minmax_params = xnn_init_f32_minmax_params(output_min, output_max);
+    convolution_op->params.f32_minmax = xnn_init_f32_minmax_params(output_min, output_max);
   }
 
   convolution_op->type = xnn_operator_type_convolution_nchw_f32;
@@ -738,7 +738,7 @@
     2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
     sizeof(float) /* sizeof(bias element) */,
     2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
-    &convolution_op->f32_minmax_params,
-    &convolution_op->f32_chw_params,
+    &convolution_op->params.f32_minmax,
+    &convolution_op->params.f32_chw,
     pthreadpool_get_threads_count(threadpool));
 }
diff --git a/src/operators/convolution-nhwc.c b/src/operators/convolution-nhwc.c
index c740ba8..24e41a8 100644
--- a/src/operators/convolution-nhwc.c
+++ b/src/operators/convolution-nhwc.c
@@ -381,7 +381,7 @@
 
   convolution_op->kernel_zero_point = kernel_zero_point;
 
-  convolution_op->q8_gemm_params =
+  convolution_op->params.q8_gemm =
     xnn_init_q8_gemm_params(
       input_zero_point, kernel_zero_point,
       convolution_scale, output_zero_point, output_min, output_max);
@@ -732,7 +732,7 @@
   convolution_op->input_pixel_stride = input_pixel_stride;
   convolution_op->output_pixel_stride = output_pixel_stride;
 
-  convolution_op->f32_minmax_params = xnn_init_f32_minmax_params(output_min, output_max);
+  convolution_op->params.f32_minmax = xnn_init_f32_minmax_params(output_min, output_max);
 
   convolution_op->type = xnn_operator_type_convolution_nhwc_f32;
   convolution_op->ukernel.type = ukernel_type;
@@ -1148,7 +1148,7 @@
     0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
     sizeof(int32_t) /* sizeof(bias element) */,
     0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
-    &convolution_op->q8_gemm_params,
+    &convolution_op->params.q8_gemm,
     pthreadpool_get_threads_count(threadpool));
 }
 
@@ -1176,6 +1176,6 @@
     2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
     sizeof(float) /* sizeof(bias element) */,
     2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
-    &convolution_op->f32_minmax_params,
+    &convolution_op->params.f32_minmax,
     pthreadpool_get_threads_count(threadpool));
 }
diff --git a/src/operators/deconvolution-nhwc.c b/src/operators/deconvolution-nhwc.c
index 9514e52..9f080f8 100644
--- a/src/operators/deconvolution-nhwc.c
+++ b/src/operators/deconvolution-nhwc.c
@@ -298,7 +298,7 @@
 
   deconvolution_op->kernel_zero_point = kernel_zero_point;
 
-  deconvolution_op->q8_gemm_params =
+  deconvolution_op->params.q8_gemm =
     xnn_init_q8_gemm_params(
       input_zero_point, kernel_zero_point,
       deconvolution_scale, output_zero_point, output_min, output_max);
@@ -584,7 +584,7 @@
   deconvolution_op->input_pixel_stride = input_pixel_stride;
   deconvolution_op->output_pixel_stride = output_pixel_stride;
 
-  deconvolution_op->f32_minmax_params = xnn_init_f32_minmax_params(output_min, output_max);
+  deconvolution_op->params.f32_minmax = xnn_init_f32_minmax_params(output_min, output_max);
 
   deconvolution_op->type = xnn_operator_type_deconvolution_nhwc_f32;
   deconvolution_op->ukernel.type = ukernel_type;
@@ -1037,7 +1037,7 @@
     0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
     sizeof(int32_t) /* sizeof(bias element) */,
     0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
-    &deconvolution_op->q8_gemm_params,
+    &deconvolution_op->params.q8_gemm,
     pthreadpool_get_threads_count(threadpool));
 }
 
@@ -1068,6 +1068,6 @@
     2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
     sizeof(float) /* sizeof(bias element) */,
     2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
-    &deconvolution_op->f32_minmax_params,
+    &deconvolution_op->params.f32_minmax,
     pthreadpool_get_threads_count(threadpool));
 }
diff --git a/src/operators/fully-connected-nc.c b/src/operators/fully-connected-nc.c
index 85cd577..61c2cf1 100644
--- a/src/operators/fully-connected-nc.c
+++ b/src/operators/fully-connected-nc.c
@@ -169,7 +169,7 @@
 
   fully_connected_op->kernel_zero_point = kernel_zero_point;
 
-  fully_connected_op->q8_gemm_params =
+  fully_connected_op->params.q8_gemm =
     xnn_init_q8_gemm_params(
       input_zero_point, kernel_zero_point,
       requantization_scale, output_zero_point, output_min, output_max);
@@ -314,7 +314,7 @@
   fully_connected_op->input_pixel_stride = input_stride;
   fully_connected_op->output_pixel_stride = output_stride;
 
-  fully_connected_op->f32_minmax_params = xnn_init_f32_minmax_params(output_min, output_max);
+  fully_connected_op->params.f32_minmax = xnn_init_f32_minmax_params(output_min, output_max);
 
   fully_connected_op->type = xnn_operator_type_fully_connected_nc_f32;
 
@@ -445,7 +445,7 @@
     0 /* log2(sizeof(filter element)) = log2(sizeof(uint8_t)) */,
     sizeof(int32_t) /* sizeof(bias element) */,
     0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
-    &fully_connected_op->q8_gemm_params,
+    &fully_connected_op->params.q8_gemm,
     pthreadpool_get_threads_count(threadpool));
 }
 
@@ -471,6 +471,6 @@
     2 /* log2(sizeof(filter element)) = log2(sizeof(float)) */,
     sizeof(float) /* sizeof(bias element) */,
     2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
-    &fully_connected_op->f32_minmax_params,
+    &fully_connected_op->params.f32_minmax,
     pthreadpool_get_threads_count(threadpool));
 }
diff --git a/src/operators/global-average-pooling-ncw.c b/src/operators/global-average-pooling-ncw.c
index 80076b2..771128e 100644
--- a/src/operators/global-average-pooling-ncw.c
+++ b/src/operators/global-average-pooling-ncw.c
@@ -82,7 +82,7 @@
   }
 
   global_average_pooling_op->channels = channels;
-  global_average_pooling_op->f32_gavgpool_params = xnn_init_f32_gavgpool_params(nanf(""), output_min, output_max, 0);
+  global_average_pooling_op->params.f32_gavgpool = xnn_init_f32_gavgpool_params(nanf(""), output_min, output_max, 0);
 
   global_average_pooling_op->type = xnn_operator_type_global_average_pooling_ncw_f32;
   global_average_pooling_op->ukernel.type = xnn_ukernel_type_global_average_pooling;
@@ -131,7 +131,7 @@
     return xnn_status_success;
   }
 
-  xnn_update_f32_gavgpool_params(&global_average_pooling_op->f32_gavgpool_params,
+  xnn_update_f32_gavgpool_params(&global_average_pooling_op->params.f32_gavgpool,
     1.0f / (float) width, width);
 
   global_average_pooling_op->context.global_average_pooling_ncw = (struct global_average_pooling_ncw_context) {
@@ -143,7 +143,7 @@
     .output_channel_stride = sizeof(float),
     .output_batch_stride = global_average_pooling_op->channels * sizeof(float),
     .ukernel = xnn_params.f32.gavgpool_cw.ukernel,
-    .params.f32 = global_average_pooling_op->f32_gavgpool_params,
+    .params.f32 = global_average_pooling_op->params.f32_gavgpool,
   };
 
   global_average_pooling_op->compute.type = xnn_parallelization_type_2d_tile_1d;
diff --git a/src/operators/global-average-pooling-nwc.c b/src/operators/global-average-pooling-nwc.c
index 0ca860b..b0cd714 100644
--- a/src/operators/global-average-pooling-nwc.c
+++ b/src/operators/global-average-pooling-nwc.c
@@ -228,7 +228,7 @@
   global_average_pooling_op->channels = channels;
   global_average_pooling_op->input_pixel_stride = input_stride;
   global_average_pooling_op->output_pixel_stride = output_stride;
-  global_average_pooling_op->f32_scaleminmax_params = xnn_init_f32_scaleminmax_params(nanf(""), output_min, output_max);
+  global_average_pooling_op->params.f32_scaleminmax = xnn_init_f32_scaleminmax_params(nanf(""), output_min, output_max);
 
   global_average_pooling_op->type = xnn_operator_type_global_average_pooling_nwc_f32;
   global_average_pooling_op->ukernel.type = xnn_ukernel_type_global_average_pooling;
@@ -281,7 +281,7 @@
   global_average_pooling_op->input = input;
   global_average_pooling_op->output = output;
 
-  global_average_pooling_op->q8_avgpool_params =
+  global_average_pooling_op->params.q8_avgpool =
     xnn_init_q8_avgpool_params(
       -(int32_t) width * (int32_t) (uint32_t) global_average_pooling_op->input_zero_point,
       global_average_pooling_op->input_scale / (global_average_pooling_op->output_scale * (float) width),
@@ -300,7 +300,7 @@
       .channels = channels,
       .output = output,
       .output_batch_stride = global_average_pooling_op->output_pixel_stride * sizeof(uint8_t),
-      .params.q8 = global_average_pooling_op->q8_avgpool_params,
+      .params.q8 = global_average_pooling_op->params.q8_avgpool,
   };
   global_average_pooling_op->compute.type = xnn_parallelization_type_1d;
   global_average_pooling_op->compute.range[0] = batch_size;
@@ -355,7 +355,7 @@
   global_average_pooling_op->input = input;
   global_average_pooling_op->output = output;
 
-  xnn_update_f32_scaleminmax_params(&global_average_pooling_op->f32_scaleminmax_params, 1.0f / (float) width);
+  xnn_update_f32_scaleminmax_params(&global_average_pooling_op->params.f32_scaleminmax, 1.0f / (float) width);
 
   const size_t input_stride_in_bytes = global_average_pooling_op->input_pixel_stride * sizeof(float);
   const size_t channels = global_average_pooling_op->channels;
@@ -368,7 +368,7 @@
       .channels = channels,
       .output = output,
       .output_batch_stride = global_average_pooling_op->output_pixel_stride * sizeof(float),
-      .params.f32 = global_average_pooling_op->f32_scaleminmax_params,
+      .params.f32 = global_average_pooling_op->params.f32_scaleminmax,
   };
   global_average_pooling_op->compute.type = xnn_parallelization_type_1d;
   global_average_pooling_op->compute.range[0] = batch_size;
diff --git a/src/operators/hardswish-nc.c b/src/operators/hardswish-nc.c
deleted file mode 100644
index 46adba3..0000000
--- a/src/operators/hardswish-nc.c
+++ /dev/null
@@ -1,149 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <math.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdlib.h>
-
-#include <xnnpack.h>
-#include <xnnpack/allocator.h>
-#include <xnnpack/log.h>
-#include <xnnpack/operator.h>
-#include <xnnpack/params-init.h>
-#include <xnnpack/params.h>
-
-
-enum xnn_status xnn_create_hardswish_nc_f32(
-    size_t channels,
-    size_t input_stride,
-    size_t output_stride,
-    uint32_t flags,
-    xnn_operator_t* hardswish_op_out)
-{
-  xnn_operator_t hardswish_op = NULL;
-  enum xnn_status status = xnn_status_uninitialized;
-
-  if (!xnn_params.initialized) {
-    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
-      xnn_operator_type_to_string(xnn_operator_type_hardswish_nc_f32));
-    goto error;
-  }
-
-  status = xnn_status_invalid_parameter;
-
-  if (channels == 0) {
-    xnn_log_error(
-      "failed to create %s operator with %zu channels: number of channels must be non-zero",
-      xnn_operator_type_to_string(xnn_operator_type_hardswish_nc_f32), channels);
-    goto error;
-  }
-
-  if (input_stride < channels) {
-    xnn_log_error(
-      "failed to create %s operator with input element stride of %zu: "
-      "stride must be at least as large as the number of channels (%zu)",
-      xnn_operator_type_to_string(xnn_operator_type_hardswish_nc_f32), input_stride, channels);
-    goto error;
-  }
-
-  if (output_stride < channels) {
-    xnn_log_error(
-      "failed to create %s operator with output element stride of %zu: "
-      "stride must be at least as large as the number of channels (%zu)",
-      xnn_operator_type_to_string(xnn_operator_type_hardswish_nc_f32), output_stride, channels);
-    goto error;
-  }
-
-  status = xnn_status_out_of_memory;
-
-  hardswish_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
-  if (hardswish_op == NULL) {
-    xnn_log_error(
-      "failed to allocate %zu bytes for %s operator descriptor",
-      sizeof(struct xnn_operator), xnn_operator_type_to_string(xnn_operator_type_hardswish_nc_f32));
-    goto error;
-  }
-
-  hardswish_op->channels = channels;
-  hardswish_op->input_pixel_stride = input_stride;
-  hardswish_op->output_pixel_stride = output_stride;
-  hardswish_op->f32_hswish_params = xnn_init_f32_hswish_params();
-
-  hardswish_op->type = xnn_operator_type_hardswish_nc_f32;
-  hardswish_op->ukernel.type = xnn_ukernel_type_hswish;
-
-  hardswish_op->state = xnn_run_state_invalid;
-
-  *hardswish_op_out = hardswish_op;
-  return xnn_status_success;
-
-error:
-  xnn_delete_operator(hardswish_op);
-  return status;
-}
-
-enum xnn_status xnn_setup_hardswish_nc_f32(
-    xnn_operator_t hardswish_op,
-    size_t batch_size,
-    const float* input,
-    float* output,
-    pthreadpool_t threadpool)
-{
-  if (hardswish_op->type != xnn_operator_type_hardswish_nc_f32) {
-    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
-      xnn_operator_type_to_string(xnn_operator_type_hardswish_nc_f32),
-      xnn_operator_type_to_string(hardswish_op->type));
-    return xnn_status_invalid_parameter;
-  }
-  hardswish_op->state = xnn_run_state_invalid;
-
-  if (!xnn_params.initialized) {
-    xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
-      xnn_operator_type_to_string(xnn_operator_type_hardswish_nc_f32));
-    return xnn_status_uninitialized;
-  }
-
-  if (batch_size == 0) {
-    hardswish_op->state = xnn_run_state_skip;
-    return xnn_status_success;
-  }
-
-  const size_t channels = hardswish_op->channels;
-  const size_t input_stride = hardswish_op->input_pixel_stride;
-  const size_t output_stride = hardswish_op->output_pixel_stride;
-  if ((((input_stride ^ channels) | (output_stride ^ channels)) == 0) || batch_size == 1) {
-    const size_t block_size = 4096;
-    hardswish_op->context.univector_contiguous = (struct univector_contiguous_context) {
-      .x = input,
-      .x_stride = input_stride * sizeof(float),
-      .y = output,
-      .y_stride = output_stride * sizeof(float),
-      .ukernel = xnn_params.f32.hswish,
-      .params.f32_hswish = hardswish_op->f32_hswish_params,
-    };
-    hardswish_op->compute.type = xnn_parallelization_type_1d_tile_1d;
-    hardswish_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_univector_contiguous;
-    hardswish_op->compute.range[0] = batch_size * channels * sizeof(float);
-    hardswish_op->compute.tile[0] = block_size;
-  } else {
-    hardswish_op->context.univector_strided = (struct univector_strided_context) {
-      .n = channels * sizeof(float),
-      .x = input,
-      .x_stride = input_stride * sizeof(float),
-      .y = output,
-      .y_stride = output_stride * sizeof(float),
-      .ukernel = xnn_params.f32.hswish,
-      .params.f32_hswish = hardswish_op->f32_hswish_params,
-    };
-    hardswish_op->compute.type = xnn_parallelization_type_1d_tile_1d;
-    hardswish_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_univector_strided;
-    hardswish_op->compute.range[0] = batch_size;
-    hardswish_op->compute.tile[0] = 1;
-  }
-  hardswish_op->state = xnn_run_state_ready;
-
-  return xnn_status_success;
-}
diff --git a/src/operators/max-pooling-nhwc.c b/src/operators/max-pooling-nhwc.c
index 0990a20..341c1e6 100644
--- a/src/operators/max-pooling-nhwc.c
+++ b/src/operators/max-pooling-nhwc.c
@@ -169,7 +169,7 @@
   max_pooling_op->input_pixel_stride = input_pixel_stride;
   max_pooling_op->output_pixel_stride = output_pixel_stride;
 
-  max_pooling_op->u8_minmax_params = xnn_init_u8_minmax_params(output_min, output_max);
+  max_pooling_op->params.u8_minmax = xnn_init_u8_minmax_params(output_min, output_max);
 
   max_pooling_op->type = xnn_operator_type_max_pooling_nhwc_u8;
   max_pooling_op->ukernel.type = xnn_ukernel_type_max_pooling;
@@ -327,7 +327,7 @@
   max_pooling_op->input_pixel_stride = input_pixel_stride;
   max_pooling_op->output_pixel_stride = output_pixel_stride;
 
-  max_pooling_op->f32_minmax_params = xnn_init_f32_minmax_params(output_min, output_max);
+  max_pooling_op->params.f32_minmax = xnn_init_f32_minmax_params(output_min, output_max);
 
   max_pooling_op->type = xnn_operator_type_max_pooling_nhwc_f32;
   max_pooling_op->ukernel.type = xnn_ukernel_type_max_pooling;
@@ -498,7 +498,7 @@
     0 /* log2(sizeof(input element)) = log2(sizeof(uint8_t)) */,
     0 /* log2(sizeof(output element)) = log2(sizeof(uint8_t)) */,
     &xnn_params.u8.maxpool,
-    &max_pooling_op->u8_minmax_params,
+    &max_pooling_op->params.u8_minmax,
     pthreadpool_get_threads_count(threadpool));
 }
 
@@ -525,7 +525,7 @@
     2 /* log2(sizeof(input element)) = log2(sizeof(float)) */,
     2 /* log2(sizeof(output element)) = log2(sizeof(float)) */,
     &xnn_params.f32.maxpool,
-    &max_pooling_op->f32_minmax_params,
+    &max_pooling_op->params.f32_minmax,
     pthreadpool_get_threads_count(threadpool));
 }
 
diff --git a/src/operators/sigmoid-nc.c b/src/operators/sigmoid-nc.c
index 8724153..4f95cca 100644
--- a/src/operators/sigmoid-nc.c
+++ b/src/operators/sigmoid-nc.c
@@ -153,74 +153,6 @@
   return status;
 }
 
-enum xnn_status xnn_create_sigmoid_nc_f32(
-    size_t channels,
-    size_t input_stride,
-    size_t output_stride,
-    uint32_t flags,
-    xnn_operator_t* sigmoid_op_out)
-{
-  xnn_operator_t sigmoid_op = NULL;
-  enum xnn_status status = xnn_status_uninitialized;
-
-  if (!xnn_params.initialized) {
-    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
-      xnn_operator_type_to_string(xnn_operator_type_sigmoid_nc_f32));
-    goto error;
-  }
-
-  status = xnn_status_invalid_parameter;
-
-  if (channels == 0) {
-    xnn_log_error(
-      "failed to create %s operator with %zu channels: number of channels must be non-zero",
-      xnn_operator_type_to_string(xnn_operator_type_sigmoid_nc_f32), channels);
-    goto error;
-  }
-
-  if (input_stride < channels) {
-    xnn_log_error(
-      "failed to create %s operator with input element stride of %zu: "
-      "stride must be at least as large as the number of channels (%zu)",
-      xnn_operator_type_to_string(xnn_operator_type_sigmoid_nc_f32), input_stride, channels);
-    goto error;
-  }
-
-  if (output_stride < channels) {
-    xnn_log_error(
-      "failed to create %s operator with output element stride of %zu: "
-      "stride must be at least as large as the number of channels (%zu)",
-      xnn_operator_type_to_string(xnn_operator_type_sigmoid_nc_f32), output_stride, channels);
-    goto error;
-  }
-
-  status = xnn_status_out_of_memory;
-
-  sigmoid_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
-  if (sigmoid_op == NULL) {
-    xnn_log_error(
-      "failed to allocate %zu bytes for %s operator descriptor",
-      sizeof(struct xnn_operator), xnn_operator_type_to_string(xnn_operator_type_sigmoid_nc_f32));
-    goto error;
-  }
-
-  sigmoid_op->channels = channels;
-  sigmoid_op->input_pixel_stride = input_stride;
-  sigmoid_op->output_pixel_stride = output_stride;
-
-  sigmoid_op->type = xnn_operator_type_sigmoid_nc_f32;
-  sigmoid_op->ukernel.type = xnn_ukernel_type_sigmoid;
-
-  sigmoid_op->state = xnn_run_state_invalid;
-
-  *sigmoid_op_out = sigmoid_op;
-  return xnn_status_success;
-
-error:
-  xnn_delete_operator(sigmoid_op);
-  return status;
-}
-
 enum xnn_status xnn_setup_sigmoid_nc_q8(
     xnn_operator_t sigmoid_op,
     size_t batch_size,
@@ -287,64 +219,3 @@
 
   return xnn_status_success;
 }
-
-enum xnn_status xnn_setup_sigmoid_nc_f32(
-    xnn_operator_t sigmoid_op,
-    size_t batch_size,
-    const float* input,
-    float* output,
-    pthreadpool_t threadpool)
-{
-  if (sigmoid_op->type != xnn_operator_type_sigmoid_nc_f32) {
-    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
-      xnn_operator_type_to_string(xnn_operator_type_sigmoid_nc_f32),
-      xnn_operator_type_to_string(sigmoid_op->type));
-    return xnn_status_invalid_parameter;
-  }
-  sigmoid_op->state = xnn_run_state_invalid;
-
-  if (!xnn_params.initialized) {
-    xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
-      xnn_operator_type_to_string(xnn_operator_type_sigmoid_nc_f32));
-    return xnn_status_uninitialized;
-  }
-
-  if (batch_size == 0) {
-    sigmoid_op->state = xnn_run_state_skip;
-    return xnn_status_success;
-  }
-
-  const size_t channels = sigmoid_op->channels;
-  const size_t input_stride = sigmoid_op->input_pixel_stride;
-  const size_t output_stride = sigmoid_op->output_pixel_stride;
-  if ((((input_stride ^ channels) | (output_stride ^ channels)) == 0) || batch_size == 1) {
-    const size_t block_size = 4096;
-    sigmoid_op->context.univector_contiguous = (struct univector_contiguous_context) {
-      .x = input,
-      .x_stride = input_stride * sizeof(float),
-      .y = output,
-      .y_stride = output_stride * sizeof(float),
-      .ukernel = xnn_params.f32.sigmoid,
-    };
-    sigmoid_op->compute.type = xnn_parallelization_type_1d_tile_1d;
-    sigmoid_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_univector_contiguous;
-    sigmoid_op->compute.range[0] = batch_size * channels * sizeof(float);
-    sigmoid_op->compute.tile[0] = block_size;
-  } else {
-    sigmoid_op->context.univector_strided = (struct univector_strided_context) {
-      .n = channels * sizeof(float),
-      .x = input,
-      .x_stride = input_stride * sizeof(float),
-      .y = output,
-      .y_stride = output_stride * sizeof(float),
-      .ukernel = xnn_params.f32.sigmoid,
-    };
-    sigmoid_op->compute.type = xnn_parallelization_type_1d_tile_1d;
-    sigmoid_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_univector_strided;
-    sigmoid_op->compute.range[0] = batch_size;
-    sigmoid_op->compute.tile[0] = 1;
-  }
-  sigmoid_op->state = xnn_run_state_ready;
-
-  return xnn_status_success;
-}
diff --git a/src/operators/unary-elementwise-nc.c b/src/operators/unary-elementwise-nc.c
new file mode 100644
index 0000000..9a41088
--- /dev/null
+++ b/src/operators/unary-elementwise-nc.c
@@ -0,0 +1,330 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <inttypes.h>
+#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <xnnpack.h>
+#include <xnnpack/allocator.h>
+#include <xnnpack/log.h>
+#include <xnnpack/operator.h>
+#include <xnnpack/params-init.h>
+#include <xnnpack/params.h>
+
+
+static enum xnn_status create_unary_elementwise_nc(
+    size_t channels,
+    size_t input_stride,
+    size_t output_stride,
+    uint32_t flags,
+    const void* params,
+    size_t params_size,
+    enum xnn_operator_type operator_type,
+    xnn_operator_t* unary_elementwise_op_out)
+{
+  xnn_operator_t unary_elementwise_op = NULL;
+
+  if (!xnn_params.initialized) {
+    xnn_log_error("failed to create %s operator: XNNPACK is not initialized",
+      xnn_operator_type_to_string(operator_type));
+    return xnn_status_uninitialized;
+  }
+
+  if (channels == 0) {
+    xnn_log_error(
+      "failed to create %s operator with %zu channels: number of channels must be non-zero",
+      xnn_operator_type_to_string(operator_type), channels);
+    return xnn_status_invalid_parameter;
+  }
+
+  if (input_stride < channels) {
+    xnn_log_error(
+      "failed to create %s operator with input element stride of %zu: "
+      "stride must be at least as large as the number of channels (%zu)",
+      xnn_operator_type_to_string(operator_type), input_stride, channels);
+    return xnn_status_invalid_parameter;
+  }
+
+  if (output_stride < channels) {
+    xnn_log_error(
+      "failed to create %s operator with output element stride of %zu: "
+      "stride must be at least as large as the number of channels (%zu)",
+      xnn_operator_type_to_string(operator_type), output_stride, channels);
+    return xnn_status_invalid_parameter;
+  }
+
+  unary_elementwise_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
+  if (unary_elementwise_op == NULL) {
+    xnn_log_error(
+      "failed to allocate %zu bytes for %s operator descriptor",
+      sizeof(struct xnn_operator), xnn_operator_type_to_string(operator_type));
+    return xnn_status_out_of_memory;
+  }
+
+  unary_elementwise_op->channels = channels;
+  unary_elementwise_op->input_pixel_stride = input_stride;
+  unary_elementwise_op->output_pixel_stride = output_stride;
+  if (params_size != 0) {
+    memcpy(&unary_elementwise_op->params, params, params_size);
+  }
+
+  unary_elementwise_op->type = operator_type;
+  unary_elementwise_op->ukernel.type = xnn_ukernel_type_unary_elementwise;
+
+  unary_elementwise_op->state = xnn_run_state_invalid;
+
+  *unary_elementwise_op_out = unary_elementwise_op;
+  return xnn_status_success;
+}
+
+static enum xnn_status setup_unary_elementwise_nc(
+    xnn_operator_t unary_elementwise_op,
+    size_t batch_size,
+    const void* input,
+    void* output,
+    xnn_univector_ukernel_function ukernel,
+    uint32_t log2_element_size,
+    const void* params,
+    size_t params_size)
+{
+  if (!xnn_params.initialized) {
+    xnn_log_error("failed to setup %s operator: XNNPACK is not initialized",
+      xnn_operator_type_to_string(unary_elementwise_op->type));
+    return xnn_status_uninitialized;
+  }
+
+  if (batch_size == 0) {
+    unary_elementwise_op->state = xnn_run_state_skip;
+    return xnn_status_success;
+  }
+
+  const size_t channels = unary_elementwise_op->channels;
+  const size_t input_stride = unary_elementwise_op->input_pixel_stride;
+  const size_t output_stride = unary_elementwise_op->output_pixel_stride;
+  if ((((input_stride ^ channels) | (output_stride ^ channels)) == 0) || batch_size == 1) {
+    const size_t block_size = 4096;
+    unary_elementwise_op->context.univector_contiguous = (struct univector_contiguous_context) {
+      .x = input,
+      .x_stride = input_stride << log2_element_size,
+      .y = output,
+      .y_stride = output_stride << log2_element_size,
+      .ukernel = ukernel,
+    };
+    if (params_size != 0) {
+      memcpy(&unary_elementwise_op->context.univector_contiguous.params, params, params_size);
+    }
+    unary_elementwise_op->compute.type = xnn_parallelization_type_1d_tile_1d;
+    unary_elementwise_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_univector_contiguous;
+    unary_elementwise_op->compute.range[0] = (batch_size * channels) << log2_element_size;
+    unary_elementwise_op->compute.tile[0] = block_size;
+  } else {
+    unary_elementwise_op->context.univector_strided = (struct univector_strided_context) {
+      .n = channels << log2_element_size,
+      .x = input,
+      .x_stride = input_stride << log2_element_size,
+      .y = output,
+      .y_stride = output_stride << log2_element_size,
+      .ukernel = ukernel,
+    };
+    if (params_size != 0) {
+      memcpy(&unary_elementwise_op->context.univector_strided.params, params, params_size);
+    }
+    unary_elementwise_op->compute.type = xnn_parallelization_type_1d_tile_1d;
+    unary_elementwise_op->compute.task_1d_tile_1d = (pthreadpool_task_1d_tile_1d_t) xnn_compute_univector_strided;
+    unary_elementwise_op->compute.range[0] = batch_size;
+    unary_elementwise_op->compute.tile[0] = 1;
+  }
+  unary_elementwise_op->state = xnn_run_state_ready;
+
+  return xnn_status_success;
+}
+
+enum xnn_status xnn_create_clamp_nc_u8(
+    size_t channels,
+    size_t input_stride,
+    size_t output_stride,
+    uint8_t output_min,
+    uint8_t output_max,
+    uint32_t flags,
+    xnn_operator_t* clamp_op_out)
+{
+  if (output_min >= output_max) {
+    xnn_log_error(
+      "failed to create %s operator with [%" PRIu8 ", %" PRIu8 "] output range: range min must be below range max",
+      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_u8), output_min, output_max);
+    return xnn_status_invalid_parameter;
+  }
+
+  const union xnn_u8_minmax_params params = xnn_init_u8_minmax_params(output_min, output_max);
+  return create_unary_elementwise_nc(
+    channels, input_stride, output_stride, flags,
+    &params, sizeof(params),
+    xnn_operator_type_clamp_nc_u8,
+    clamp_op_out);
+}
+
+enum xnn_status xnn_create_clamp_nc_f32(
+    size_t channels,
+    size_t input_stride,
+    size_t output_stride,
+    float output_min,
+    float output_max,
+    uint32_t flags,
+    xnn_operator_t* clamp_op_out)
+{
+  if (isnan(output_min)) {
+    xnn_log_error(
+      "failed to create %s operator with NaN output lower bound: lower bound must be non-NaN",
+      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32));
+    return xnn_status_invalid_parameter;
+  }
+
+  if (isnan(output_max)) {
+    xnn_log_error(
+      "failed to create %s operator with NaN output upper bound: upper bound must be non-NaN",
+      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32));
+    return xnn_status_invalid_parameter;
+  }
+
+  if (output_min >= output_max) {
+    xnn_log_error(
+      "failed to create %s operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
+      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32), output_min, output_max);
+    return xnn_status_invalid_parameter;
+  }
+
+  const union xnn_f32_minmax_params params = xnn_init_f32_minmax_params(output_min, output_max);
+  return create_unary_elementwise_nc(
+    channels, input_stride, output_stride, flags,
+    &params, sizeof(params),
+    xnn_operator_type_clamp_nc_f32,
+    clamp_op_out);
+}
+
+enum xnn_status xnn_create_hardswish_nc_f32(
+    size_t channels,
+    size_t input_stride,
+    size_t output_stride,
+    uint32_t flags,
+    xnn_operator_t* hardswish_op_out)
+{
+  const union xnn_f32_hswish_params params = xnn_init_f32_hswish_params();
+  return create_unary_elementwise_nc(
+    channels, input_stride, output_stride, flags,
+    &params, sizeof(params),
+    xnn_operator_type_hardswish_nc_f32,
+    hardswish_op_out);
+}
+
+enum xnn_status xnn_create_sigmoid_nc_f32(
+    size_t channels,
+    size_t input_stride,
+    size_t output_stride,
+    uint32_t flags,
+    xnn_operator_t* sigmoid_op_out)
+{
+  return create_unary_elementwise_nc(
+    channels, input_stride, output_stride, flags,
+    NULL, 0,
+    xnn_operator_type_sigmoid_nc_f32,
+    sigmoid_op_out);
+}
+
+enum xnn_status xnn_setup_clamp_nc_u8(
+    xnn_operator_t clamp_op,
+    size_t batch_size,
+    const uint8_t* input,
+    uint8_t* output,
+    pthreadpool_t threadpool)
+{
+  if (clamp_op->type != xnn_operator_type_clamp_nc_u8) {
+    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
+      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_u8),
+      xnn_operator_type_to_string(clamp_op->type));
+    return xnn_status_invalid_parameter;
+  }
+  clamp_op->state = xnn_run_state_invalid;
+
+  return setup_unary_elementwise_nc(
+    clamp_op,
+    batch_size, input, output,
+    xnn_params.u8.clamp,
+    0 /* log2(sizeof(uint8_t)) */,
+    &clamp_op->params.u8_minmax, sizeof(clamp_op->params.u8_minmax));
+}
+
+enum xnn_status xnn_setup_clamp_nc_f32(
+    xnn_operator_t clamp_op,
+    size_t batch_size,
+    const float* input,
+    float* output,
+    pthreadpool_t threadpool)
+{
+  if (clamp_op->type != xnn_operator_type_clamp_nc_f32) {
+    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
+      xnn_operator_type_to_string(xnn_operator_type_clamp_nc_f32),
+      xnn_operator_type_to_string(clamp_op->type));
+    return xnn_status_invalid_parameter;
+  }
+  clamp_op->state = xnn_run_state_invalid;
+
+  return setup_unary_elementwise_nc(
+    clamp_op,
+    batch_size, input, output,
+    xnn_params.f32.clamp,
+    2 /* log2(sizeof(float)) */,
+    &clamp_op->params.f32_minmax, sizeof(clamp_op->params.f32_minmax));
+}
+
+enum xnn_status xnn_setup_hardswish_nc_f32(
+    xnn_operator_t hardswish_op,
+    size_t batch_size,
+    const float* input,
+    float* output,
+    pthreadpool_t threadpool)
+{
+  if (hardswish_op->type != xnn_operator_type_hardswish_nc_f32) {
+    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
+      xnn_operator_type_to_string(xnn_operator_type_hardswish_nc_f32),
+      xnn_operator_type_to_string(hardswish_op->type));
+    return xnn_status_invalid_parameter;
+  }
+  hardswish_op->state = xnn_run_state_invalid;
+
+  return setup_unary_elementwise_nc(
+    hardswish_op,
+    batch_size, input, output,
+    xnn_params.f32.hswish,
+    2 /* log2(sizeof(float)) */,
+    &hardswish_op->params.f32_hswish, sizeof(hardswish_op->params.f32_hswish));
+}
+
+enum xnn_status xnn_setup_sigmoid_nc_f32(
+    xnn_operator_t sigmoid_op,
+    size_t batch_size,
+    const float* input,
+    float* output,
+    pthreadpool_t threadpool)
+{
+  if (sigmoid_op->type != xnn_operator_type_sigmoid_nc_f32) {
+    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
+      xnn_operator_type_to_string(xnn_operator_type_sigmoid_nc_f32),
+      xnn_operator_type_to_string(sigmoid_op->type));
+    return xnn_status_invalid_parameter;
+  }
+  sigmoid_op->state = xnn_run_state_invalid;
+
+  return setup_unary_elementwise_nc(
+    sigmoid_op,
+    batch_size, input, output,
+    xnn_params.f32.sigmoid,
+    2 /* log2(sizeof(float)) */,
+    NULL, 0);
+}
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
index cd39d62..4c20e8b 100644
--- a/src/xnnpack/operator.h
+++ b/src/xnnpack/operator.h
@@ -24,22 +24,20 @@
   xnn_ukernel_type_average_pooling,
   xnn_ukernel_type_binary_elementwise,
   xnn_ukernel_type_channel_shuffle,
-  xnn_ukernel_type_clamp,
   xnn_ukernel_type_conv2d_hwc2chw,
   xnn_ukernel_type_dwconv,
   xnn_ukernel_type_gemm,
   xnn_ukernel_type_global_average_pooling,
-  xnn_ukernel_type_hswish,
   xnn_ukernel_type_igemm,
   xnn_ukernel_type_lut,
   xnn_ukernel_type_max_pooling,
   xnn_ukernel_type_pad,
   xnn_ukernel_type_pixelwise_average_pooling,
   xnn_ukernel_type_prelu,
-  xnn_ukernel_type_sigmoid,
   xnn_ukernel_type_softmax,
   xnn_ukernel_type_spmm,
   xnn_ukernel_type_subconv2d,
+  xnn_ukernel_type_unary_elementwise,
   xnn_ukernel_type_unpooling,
   xnn_ukernel_type_vmulcaddc,
 };
@@ -242,25 +240,25 @@
 
   union {
     // Parameters for Global Average Pooling in CHW layout
-    union xnn_f32_gavgpool_params f32_gavgpool_params;
-    union xnn_f32_hswish_params f32_hswish_params;
+    union xnn_f32_gavgpool_params f32_gavgpool;
+    union xnn_f32_hswish_params f32_hswish;
     // Pixelwise Average Pooling normally use f32_minmax_params, but also initialize
     // f32_scaleminmax_params in case it needs to switch to Global Average Pooling operation.
     struct {
-      union xnn_f32_scaleminmax_params f32_scaleminmax_params;
-      union xnn_f32_minmax_params f32_minmax_params;
+      union xnn_f32_minmax_params f32_minmax;
+      union xnn_f32_scaleminmax_params f32_scaleminmax;
     };
-    union xnn_f32_chw_params f32_chw_params;
-    union xnn_q8_add_params q8_add_params;
-    union xnn_q8_gemm_params q8_gemm_params;
+    union xnn_f32_chw_params f32_chw;
+    union xnn_q8_add_params q8_add;
+    union xnn_q8_gemm_params q8_gemm;
     // Average Pooling normally use q8_avgpool_params, but also initialize q8_gavgpool_params in case it needs to switch
     // to Global Average Pooling operation.
     struct {
-      union xnn_q8_avgpool_params q8_avgpool_params;
-      union xnn_q8_avgpool_params q8_gavgpool_params;
+      union xnn_q8_avgpool_params q8_avgpool;
+      union xnn_q8_avgpool_params q8_gavgpool;
     };
-    union xnn_u8_minmax_params u8_minmax_params;
-  };
+    union xnn_u8_minmax_params u8_minmax;
+  } params;
   enum xnn_operator_type type;
   struct xnn_ukernel ukernel;