Copy NC operator

PiperOrigin-RevId: 314845678
diff --git a/BUILD.bazel b/BUILD.bazel
index 5343e32..5073163 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -4226,6 +4226,15 @@
 )
 
 xnnpack_unit_test(
+    name = "copy_nc_test",  # Unit tests for the Copy (NC, X32) operator.
+    srcs = [
+        "test/copy-nc.cc",
+        "test/copy-operator-tester.h",
+    ],
+    deps = OPERATOR_TEST_DEPS,
+)
+
+xnnpack_unit_test(
     name = "deconvolution_nhwc_test",
     srcs = [
         "test/deconvolution-nhwc.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index de63c9a..60b120c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2061,6 +2061,15 @@
   TARGET_LINK_LIBRARIES(convolution-nchw-test PRIVATE XNNPACK gtest gtest_main)
   ADD_TEST(convolution-nchw-test convolution-nchw-test)
 
+  ADD_EXECUTABLE(copy-nc-test test/copy-nc.cc)  # Unit tests for the Copy (NC, X32) operator.
+  SET_TARGET_PROPERTIES(copy-nc-test PROPERTIES
+    CXX_STANDARD 11
+    CXX_STANDARD_REQUIRED YES
+    CXX_EXTENSIONS NO)
+  TARGET_INCLUDE_DIRECTORIES(copy-nc-test PRIVATE src test)
+  TARGET_LINK_LIBRARIES(copy-nc-test PRIVATE XNNPACK gtest gtest_main)
+  ADD_TEST(copy-nc-test copy-nc-test)
+
   ADD_EXECUTABLE(deconvolution-nhwc-test test/deconvolution-nhwc.cc)
   SET_TARGET_PROPERTIES(deconvolution-nhwc-test PROPERTIES
     CXX_STANDARD 11
diff --git a/README.md b/README.md
index 2c17f75..e68d13f 100644
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@
 - Channel Shuffle
 - Fully Connected
 - Clamp (includes ReLU and ReLU6)
+- Copy
 - HardSwish
 - Sigmoid
 - Softmax
diff --git a/include/xnnpack.h b/include/xnnpack.h
index a82d81c..aa17882 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -1261,6 +1261,20 @@
   void* output,
   pthreadpool_t threadpool);
 
+enum xnn_status xnn_create_copy_nc_x32(  // Creates a Copy operator for 32-bit elements in NC layout.
+  size_t channels,  // elements copied per batch row
+  size_t input_stride,  // distance between consecutive input rows, in elements
+  size_t output_stride,  // distance between consecutive output rows, in elements
+  uint32_t flags,  // the unit tests pass 0
+  xnn_operator_t* copy_op_out);  // receives the created operator
+
+enum xnn_status xnn_setup_copy_nc_x32(  // Binds batch size and buffers to a Copy operator ahead of xnn_run_operator.
+  xnn_operator_t copy_op,  // must be a Copy (NC, X32) operator, otherwise xnn_status_invalid_parameter is returned
+  size_t batch_size,  // number of rows to copy
+  const void* input,  // source buffer
+  void* output,  // destination buffer
+  pthreadpool_t threadpool);  // the unit tests pass nullptr
+
 enum xnn_status xnn_create_unpooling2d_nhwc_x32(
   uint32_t input_padding_top,
   uint32_t input_padding_right,
diff --git a/src/operator-strings.c b/src/operator-strings.c
index 358af55..31300c6 100644
--- a/src/operator-strings.c
+++ b/src/operator-strings.c
@@ -48,6 +48,8 @@
       return "Convolution (NHWC, Q8)";
     case xnn_operator_type_convolution_nchw_f32:
       return "Convolution (NCHW, F32)";
+    case xnn_operator_type_copy_nc_x32:
+      return "Copy (NC, X32)";
     case xnn_operator_type_deconvolution_nhwc_f32:
       return "Deconvolution (NHWC, F32)";
     case xnn_operator_type_deconvolution_nhwc_q8:
diff --git a/src/operators/unary-elementwise-nc.c b/src/operators/unary-elementwise-nc.c
index 9a41088..9521bbf 100644
--- a/src/operators/unary-elementwise-nc.c
+++ b/src/operators/unary-elementwise-nc.c
@@ -206,6 +206,20 @@
     clamp_op_out);
 }
 
+enum xnn_status xnn_create_copy_nc_x32(
+    size_t channels,
+    size_t input_stride,
+    size_t output_stride,
+    uint32_t flags,
+    xnn_operator_t* copy_op_out)
+{
+  // Delegates to the shared unary creator; NULL/0 is presumably an empty params blob - confirm against its signature.
+  const enum xnn_status status = create_unary_elementwise_nc(
+    channels, input_stride, output_stride, flags,
+    NULL, 0, xnn_operator_type_copy_nc_x32, copy_op_out);
+  return status;
+}
+
 enum xnn_status xnn_create_hardswish_nc_f32(
     size_t channels,
     size_t input_stride,
@@ -281,6 +295,33 @@
     &clamp_op->params.f32_minmax, sizeof(clamp_op->params.f32_minmax));
 }
 
+static void memcpy_ukernel(size_t size, const void* input, void* output, const void* params) {  // params is ignored
+  memcpy(output, input, size);  // size is passed straight to memcpy, i.e. it is a byte count
+}
+
+enum xnn_status xnn_setup_copy_nc_x32(
+    xnn_operator_t copy_op,
+    size_t batch_size,
+    const void* input,
+    void* output,
+    pthreadpool_t threadpool)
+{
+  // Expected case first: a genuine Copy operator is reset and handed to the shared unary setup routine.
+  if (copy_op->type == xnn_operator_type_copy_nc_x32) {
+    copy_op->state = xnn_run_state_invalid;
+
+    // memcpy_ukernel does the copying; elements are 32 bits wide and no params blob is supplied (NULL, 0).
+    return setup_unary_elementwise_nc(
+      copy_op, batch_size, input, output,
+      memcpy_ukernel, 2 /* log2(sizeof(uint32_t)) */, NULL, 0);
+  }
+
+  xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
+    xnn_operator_type_to_string(xnn_operator_type_copy_nc_x32),
+    xnn_operator_type_to_string(copy_op->type));
+  return xnn_status_invalid_parameter;
+}
+
 enum xnn_status xnn_setup_hardswish_nc_f32(
     xnn_operator_t hardswish_op,
     size_t batch_size,
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
index 4c20e8b..c876fd0 100644
--- a/src/xnnpack/operator.h
+++ b/src/xnnpack/operator.h
@@ -56,9 +56,10 @@
   xnn_operator_type_clamp_nc_f32,
   xnn_operator_type_clamp_nc_u8,
   xnn_operator_type_constant_pad_nd_x32,
+  xnn_operator_type_convolution_nchw_f32,
   xnn_operator_type_convolution_nhwc_f32,
   xnn_operator_type_convolution_nhwc_q8,
-  xnn_operator_type_convolution_nchw_f32,
+  xnn_operator_type_copy_nc_x32,
   xnn_operator_type_deconvolution_nhwc_f32,
   xnn_operator_type_deconvolution_nhwc_q8,
   xnn_operator_type_divide_nd_f32,
diff --git a/test/copy-nc.cc b/test/copy-nc.cc
new file mode 100644
index 0000000..0c70b1a
--- /dev/null
+++ b/test/copy-nc.cc
@@ -0,0 +1,63 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <gtest/gtest.h>
+
+#include "copy-operator-tester.h"
+
+
+TEST(COPY_NC_X32, unit_batch) {  // one row, every channel count in [1, 99]
+  for (size_t channels = 1; channels < 100; channels++) {
+    CopyOperatorTester()
+      .batch_size(1)
+      .channels(channels)
+      .iterations(3)
+      .TestX32();
+  }
+}
+
+TEST(COPY_NC_X32, small_batch) {  // three densely packed rows, every channel count in [1, 99]
+  for (size_t channels = 1; channels < 100; channels++) {
+    CopyOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .iterations(3)
+      .TestX32();
+  }
+}
+
+TEST(COPY_NC_X32, small_batch_with_input_stride) {  // three rows read from an input padded to 129 elements per row
+  for (size_t channels = 1; channels < 100; channels += 15) {
+    CopyOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .input_stride(129)
+      .iterations(3)
+      .TestX32();
+  }
+}
+
+TEST(COPY_NC_X32, small_batch_with_output_stride) {  // three rows written to an output padded to 117 elements per row
+  for (size_t channels = 1; channels < 100; channels += 15) {
+    CopyOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .output_stride(117)
+      .iterations(3)
+      .TestX32();
+  }
+}
+
+TEST(COPY_NC_X32, small_batch_with_input_and_output_stride) {  // both buffers padded, with different strides
+  for (size_t channels = 1; channels < 100; channels += 15) {
+    CopyOperatorTester()
+      .batch_size(3)
+      .channels(channels)
+      .input_stride(129)
+      .output_stride(117)
+      .iterations(3)
+      .TestX32();
+  }
+}
diff --git a/test/copy-operator-tester.h b/test/copy-operator-tester.h
new file mode 100644
index 0000000..aee4fe9
--- /dev/null
+++ b/test/copy-operator-tester.h
@@ -0,0 +1,144 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <limits>
+#include <memory>
+#include <random>
+#include <vector>
+
+#include <xnnpack.h>
+
+
+// Builder-style harness: runs the Copy (NC, X32) operator and compares it with an element-wise reference copy.
+class CopyOperatorTester {
+ public:
+  inline CopyOperatorTester& channels(size_t channels) {  // elements copied per batch row; must be non-zero
+    assert(channels != 0);
+    this->channels_ = channels;
+    return *this;
+  }
+
+  inline size_t channels() const {
+    return this->channels_;
+  }
+
+  inline CopyOperatorTester& input_stride(size_t input_stride) {  // elements between consecutive input rows
+    assert(input_stride != 0);
+    this->input_stride_ = input_stride;
+    return *this;
+  }
+
+  inline size_t input_stride() const {  // an unset (zero) stride means densely packed rows
+    if (this->input_stride_ == 0) {
+      return this->channels_;
+    }
+    assert(this->input_stride_ >= this->channels_);
+    return this->input_stride_;
+  }
+
+  inline CopyOperatorTester& output_stride(size_t output_stride) {  // elements between consecutive output rows
+    assert(output_stride != 0);
+    this->output_stride_ = output_stride;
+    return *this;
+  }
+
+  inline size_t output_stride() const {  // an unset (zero) stride means densely packed rows
+    if (this->output_stride_ == 0) {
+      return this->channels_;
+    }
+    assert(this->output_stride_ >= this->channels_);
+    return this->output_stride_;
+  }
+
+  inline CopyOperatorTester& batch_size(size_t batch_size) {  // number of rows; must be non-zero
+    assert(batch_size != 0);
+    this->batch_size_ = batch_size;
+    return *this;
+  }
+
+  inline size_t batch_size() const {
+    return this->batch_size_;
+  }
+
+  inline CopyOperatorTester& iterations(size_t iterations) {  // how many times TestX32 repeats with fresh data
+    this->iterations_ = iterations;
+    return *this;
+  }
+
+  inline size_t iterations() const {
+    return this->iterations_;
+  }
+
+  // Pushes random 32-bit words through create/setup/run of the Copy operator and checks every copied element.
+  void TestX32() const {
+    std::random_device random_device;
+    auto rng = std::mt19937(random_device());
+    auto u32rng = std::bind(std::uniform_int_distribution<uint32_t>(), rng);
+
+    std::vector<uint32_t> input(XNN_EXTRA_BYTES / sizeof(uint32_t) +
+      (batch_size() - 1) * input_stride() + channels());  // XNN_EXTRA_BYTES of padding, counted in elements
+    std::vector<uint32_t> output((batch_size() - 1) * output_stride() + channels());
+    std::vector<uint32_t> output_ref(batch_size() * channels());
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      std::generate(input.begin(), input.end(), std::ref(u32rng));
+      std::fill(output.begin(), output.end(), UINT32_C(0xDEADBEEF));
+
+      // Compute reference results.
+      for (size_t i = 0; i < batch_size(); i++) {
+        for (size_t c = 0; c < channels(); c++) {
+          output_ref[i * channels() + c] = input[i * input_stride() + c];
+        }
+      }
+
+      // Create, setup, run, and destroy Copy operator.
+      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+      xnn_operator_t copy_op = nullptr;
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_create_copy_nc_x32(
+          channels(), input_stride(), output_stride(),
+          0, &copy_op));
+      ASSERT_NE(nullptr, copy_op);
+
+      // Smart pointer to automatically delete copy_op.
+      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_copy_op(copy_op, xnn_delete_operator);
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_setup_copy_nc_x32(
+          copy_op,
+          batch_size(),
+          input.data(), output.data(),
+          nullptr /* thread pool */));
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_run_operator(copy_op, nullptr /* thread pool */));
+
+      // Verify results.
+      for (size_t i = 0; i < batch_size(); i++) {
+        for (size_t c = 0; c < channels(); c++) {
+          ASSERT_EQ(output_ref[i * channels() + c], output[i * output_stride() + c])
+            << "at batch " << i << " / " << batch_size() << ", channel = " << c << " / " << channels();
+        }
+      }
+    }
+  }
+
+ private:
+  size_t batch_size_{1};
+  size_t channels_{1};
+  size_t input_stride_{0};  // 0 = unset; the getter falls back to channels_
+  size_t output_stride_{0};  // 0 = unset; the getter falls back to channels_
+  size_t iterations_{15};
+};