F32->F16 Convert operator

Adds the xnn_create_convert_nc_f32_f16 / xnn_setup_convert_nc_f32_f16
operator API, wires per-architecture f32->f16 vcvt microkernels into
xnn_params.vcvt.f32_to_f16 (scalar bitcast/fabsf, NEON, NEONFP16, SSE2,
SSE4.1, AVX, F16C, AVX512-SKX, and WASM SIMD), registers the new
xnn_operator_type_convert_nc_f32_f16 operator type and its display
string, adds the generated microkernel sources to the Bazel and CMake
builds, and covers the new operator with ConvertOperatorTester
(TestF32toF16) unit tests, including input/output-stride variants.

PiperOrigin-RevId: 409036598
diff --git a/BUILD.bazel b/BUILD.bazel
index 41d0ff6..f8e0a57 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -148,6 +148,8 @@
     "src/f32-dwconv2d-chw/gen/5x5p2-minmax-scalar-2x1-acc2.c",
     "src/f32-dwconv2d-chw/gen/5x5s2p2-minmax-scalar-1x1-acc5.c",
     "src/f32-dwconv2d-chw/gen/5x5s2p2-minmax-scalar-2x1-acc2.c",
+    "src/f32-f16-vcvt/gen/vcvt-scalar-bitcast-x4.c",
+    "src/f32-f16-vcvt/gen/vcvt-scalar-fabsf-x2.c",
     "src/f32-gavgpool-cw/scalar-x1.c",
     "src/f32-gavgpool/7p7x-minmax-scalar-c1.c",
     "src/f32-gavgpool/7x-minmax-scalar-c1.c",
@@ -2059,6 +2061,7 @@
     "src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-neon-1x4.c",
     "src/f32-dwconv2d-chw/gen/5x5p2-minmax-neon-1x4.c",
     "src/f32-dwconv2d-chw/gen/5x5s2p2-minmax-neon-1x4.c",
+    "src/f32-f16-vcvt/gen/vcvt-neon-x8.c",
     "src/f32-gavgpool-cw/neon-x4.c",
     "src/f32-gavgpool/7p7x-minmax-neon-c4.c",
     "src/f32-gavgpool/7x-minmax-neon-c4.c",
@@ -2858,6 +2861,7 @@
 
 PROD_NEONFP16_MICROKERNEL_SRCS = [
     "src/f16-f32-vcvt/gen/vcvt-neonfp16-x16.c",
+    "src/f32-f16-vcvt/gen/vcvt-neonfp16-x16.c",
 ]
 
 ALL_NEONFP16_MICROKERNEL_SRCS = [
@@ -3833,6 +3837,7 @@
     "src/f32-argmaxpool/4x-sse2-c4.c",
     "src/f32-argmaxpool/9p8x-sse2-c4.c",
     "src/f32-argmaxpool/9x-sse2-c4.c",
+    "src/f32-f16-vcvt/gen/vcvt-sse2-x16.c",
     "src/f32-prelu/gen/sse2-2x8.c",
     "src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c",
     "src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x12.c",
@@ -4232,6 +4237,7 @@
 
 PROD_SSE41_MICROKERNEL_SRCS = [
     "src/f16-f32-vcvt/gen/vcvt-sse41-int16-x16.c",
+    "src/f32-f16-vcvt/gen/vcvt-sse41-x8.c",
     "src/f32-prelu/gen/sse41-2x8.c",
     "src/f32-vlrelu/gen/vlrelu-sse41-x8.c",
     "src/f32-vrnd/gen/vrndd-sse41-x8.c",
@@ -4533,6 +4539,7 @@
     "src/f32-dwconv/gen/up16x3-minmax-avx.c",
     "src/f32-dwconv/gen/up16x4-minmax-avx.c",
     "src/f32-dwconv/gen/up16x9-minmax-avx.c",
+    "src/f32-f16-vcvt/gen/vcvt-avx-x24.c",
     "src/f32-gemm/gen/1x16-minmax-avx-broadcast.c",
     "src/f32-gemm/gen/5x16-minmax-avx-broadcast.c",
     "src/f32-igemm/gen/1x16-minmax-avx-broadcast.c",
@@ -4941,6 +4948,7 @@
 
 PROD_F16C_MICROKERNEL_SRCS = [
     "src/f16-f32-vcvt/gen/vcvt-f16c-x16.c",
+    "src/f32-f16-vcvt/gen/vcvt-f16c-x16.c",
 ]
 
 ALL_F16C_MICROKERNEL_SRCS = [
@@ -5792,6 +5800,7 @@
 
 PROD_AVX512SKX_MICROKERNEL_SRCS = [
     "src/f16-f32-vcvt/gen/vcvt-avx512skx-x16.c",
+    "src/f32-f16-vcvt/gen/vcvt-avx512skx-x16.c",
     "src/qc8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c",
     "src/qc8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c",
     "src/qc8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 10e810f..869892c 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -276,6 +276,8 @@
   src/f32-dwconv2d-chw/gen/5x5p2-minmax-scalar-2x1-acc2.c
   src/f32-dwconv2d-chw/gen/5x5s2p2-minmax-scalar-1x1-acc5.c
   src/f32-dwconv2d-chw/gen/5x5s2p2-minmax-scalar-2x1-acc2.c
+  src/f32-f16-vcvt/gen/vcvt-scalar-bitcast-x4.c
+  src/f32-f16-vcvt/gen/vcvt-scalar-fabsf-x2.c
   src/f32-gavgpool-cw/scalar-x1.c
   src/f32-gavgpool/7p7x-minmax-scalar-c1.c
   src/f32-gavgpool/7x-minmax-scalar-c1.c
@@ -1089,6 +1091,7 @@
   src/f32-dwconv2d-chw/gen/3x3s2p1-minmax-neon-1x4.c
   src/f32-dwconv2d-chw/gen/5x5p2-minmax-neon-1x4.c
   src/f32-dwconv2d-chw/gen/5x5s2p2-minmax-neon-1x4.c
+  src/f32-f16-vcvt/gen/vcvt-neon-x8.c
   src/f32-gavgpool-cw/neon-x4.c
   src/f32-gavgpool/7p7x-minmax-neon-c4.c
   src/f32-gavgpool/7x-minmax-neon-c4.c
@@ -1885,7 +1888,8 @@
   src/xx-pad/neon.c)
 
 SET(PROD_NEONFP16_MICROKERNEL_SRCS
-  src/f16-f32-vcvt/gen/vcvt-neonfp16-x16.c)
+  src/f16-f32-vcvt/gen/vcvt-neonfp16-x16.c
+  src/f32-f16-vcvt/gen/vcvt-neonfp16-x16.c)
 
 SET(ALL_NEONFP16_MICROKERNEL_SRCS
   src/f16-f32-vcvt/gen/vcvt-neonfp16-x8.c
@@ -2847,6 +2851,7 @@
   src/f32-argmaxpool/4x-sse2-c4.c
   src/f32-argmaxpool/9p8x-sse2-c4.c
   src/f32-argmaxpool/9x-sse2-c4.c
+  src/f32-f16-vcvt/gen/vcvt-sse2-x16.c
   src/f32-prelu/gen/sse2-2x8.c
   src/f32-raddstoreexpminusmax/gen/sse2-p5-x20-acc2.c
   src/f32-velu/gen/velu-sse2-rr2-lut16-p3-x12.c
@@ -3242,6 +3247,7 @@
 
 SET(PROD_SSE41_MICROKERNEL_SRCS
   src/f16-f32-vcvt/gen/vcvt-sse41-int16-x16.c
+  src/f32-f16-vcvt/gen/vcvt-sse41-x8.c
   src/f32-prelu/gen/sse41-2x8.c
   src/f32-vlrelu/gen/vlrelu-sse41-x8.c
   src/f32-vrnd/gen/vrndd-sse41-x8.c
@@ -3537,6 +3543,7 @@
 
 SET(PROD_AVX_MICROKERNEL_SRCS
   src/f16-f32-vcvt/gen/vcvt-avx-int16-x16.c
+  src/f32-f16-vcvt/gen/vcvt-avx-x24.c
   src/f32-dwconv/gen/up8x25-minmax-avx.c
   src/f32-dwconv/gen/up16x3-minmax-avx.c
   src/f32-dwconv/gen/up16x4-minmax-avx.c
@@ -3944,7 +3951,8 @@
   src/x8-lut/gen/lut-avx-x64.c)
 
 SET(PROD_F16C_MICROKERNEL_SRCS
-  src/f16-f32-vcvt/gen/vcvt-f16c-x16.c)
+  src/f16-f32-vcvt/gen/vcvt-f16c-x16.c
+  src/f32-f16-vcvt/gen/vcvt-f16c-x16.c)
 
 SET(ALL_F16C_MICROKERNEL_SRCS
   src/f16-f32-vcvt/gen/vcvt-f16c-x8.c
@@ -4788,6 +4796,7 @@
 
 SET(PROD_AVX512SKX_MICROKERNEL_SRCS
   src/f16-f32-vcvt/gen/vcvt-avx512skx-x16.c
+  src/f32-f16-vcvt/gen/vcvt-avx512skx-x16.c
   src/qc8-dwconv/gen/up32x9-minmax-fp32-avx512skx-mul32.c
   src/qc8-dwconv/gen/up32x25-minmax-fp32-avx512skx-mul32.c
   src/qc8-gemm/gen/1x16c8-minmax-fp32-avx512skx.c
diff --git a/include/xnnpack.h b/include/xnnpack.h
index 411e94e..2caf424 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -2741,6 +2741,20 @@
   float* output,
   pthreadpool_t threadpool);
 
+enum xnn_status xnn_create_convert_nc_f32_f16(
+  size_t channels,
+  size_t input_stride,
+  size_t output_stride,
+  uint32_t flags,
+  xnn_operator_t* convert_op_out);
+
+enum xnn_status xnn_setup_convert_nc_f32_f16(
+  xnn_operator_t convert_op,
+  size_t batch_size,
+  const float* input,
+  void* output,
+  pthreadpool_t threadpool);
+
 #endif  // XNN_NO_CVT_OPERATORS
 
 #ifdef __cplusplus
diff --git a/src/init.c b/src/init.c
index 1b562fc..bfa9d79 100644
--- a/src/init.c
+++ b/src/init.c
@@ -629,8 +629,10 @@
 
       if (cpuinfo_has_arm_neon_fp16()) {
         xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16;
+        xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16;
       } else {
         xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neon_int16_x16;
+        xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neon_x8;
       }
     #endif  // XNN_NO_VCVT_OPERATORS
 
@@ -1037,6 +1039,7 @@
       init_flags |= XNN_INIT_FLAG_VCVT;
 
       xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_float_x4;
+      xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_fabsf_x2;
     #endif  // XNN_NO_VCVT_OPERATORS
 
     /**************************** X32 micro-kernels ****************************/
@@ -2194,6 +2197,7 @@
     init_flags |= XNN_INIT_FLAG_VCVT;
 
     xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__neonfp16_x16;
+    xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__neonfp16_x16;
   #endif  // XNN_NO_VCVT_OPERATORS
 
   /**************************** X32 micro-kernels ****************************/
@@ -3331,14 +3335,19 @@
 
     if (cpuinfo_has_x86_avx512f() && cpuinfo_has_x86_avx512bw() && cpuinfo_has_x86_avx512dq() && cpuinfo_has_x86_avx512vl()) {
       xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx512skx_x16;
+      xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx512skx_x16;
     } else if (cpuinfo_has_x86_f16c()) {
       xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__f16c_x16;
+      xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__f16c_x16;
     } else if (cpuinfo_has_x86_avx()) {
       xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__avx_int16_x16;
+      xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__avx_x24;
     } else if (cpuinfo_has_x86_sse4_1()) {
       xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse41_int16_x16;
+      xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse41_x8;
     } else {
       xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__sse2_int16_x32;
+      xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__sse2_x16;
     }
   #endif  // XNN_NO_VCVT_OPERATORS
 
@@ -4020,6 +4029,7 @@
     init_flags |= XNN_INIT_FLAG_VCVT;
 
     xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__wasmsimd_int16_x16;
+    xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__wasmsimd_x24;
   #endif  // XNN_NO_VCVT_OPERATORS
 
   /**************************** X32 micro-kernels ****************************/
@@ -4517,6 +4527,7 @@
     init_flags |= XNN_INIT_FLAG_VCVT;
 
     xnn_params.vcvt.f16_to_f32 = (xnn_univector_ukernel_function) xnn_f16_f32_vcvt_ukernel__scalar_float_x1;
+    xnn_params.vcvt.f32_to_f16 = (xnn_univector_ukernel_function) xnn_f32_f16_vcvt_ukernel__scalar_bitcast_x4;
   #endif  // XNN_NO_VCVT_OPERATORS
 
   /**************************** X32 micro-kernels ****************************/
diff --git a/src/operator-strings.c b/src/operator-strings.c
index d4d6266..3db439f 100644
--- a/src/operator-strings.c
+++ b/src/operator-strings.c
@@ -54,6 +54,8 @@
       return "Constant Pad (ND, X32)";
     case xnn_operator_type_convert_nc_f16_f32:
       return "Convert (NC, F16, F32)";
+    case xnn_operator_type_convert_nc_f32_f16:
+      return "Convert (NC, F32, F16)";
     case xnn_operator_type_convolution_nhwc_f16:
       return "Convolution (NHWC, F16)";
     case xnn_operator_type_convolution_nhwc_f32:
diff --git a/src/operators/unary-elementwise-nc.c b/src/operators/unary-elementwise-nc.c
index 95aaebe..6f88725 100644
--- a/src/operators/unary-elementwise-nc.c
+++ b/src/operators/unary-elementwise-nc.c
@@ -317,6 +317,21 @@
     convert_op_out);
 }
 
+enum xnn_status xnn_create_convert_nc_f32_f16(
+  size_t channels,
+  size_t input_stride,
+  size_t output_stride,
+  uint32_t flags,
+  xnn_operator_t* convert_op_out)
+{
+  return create_unary_elementwise_nc(
+    channels, input_stride, output_stride, flags,
+    NULL, 0,
+    xnn_operator_type_convert_nc_f32_f16,
+    xnn_params.vcvt.f32_to_f16,
+    convert_op_out);
+}
+
 enum xnn_status xnn_create_copy_nc_x32(
     size_t channels,
     size_t input_stride,
@@ -688,6 +703,29 @@
     NULL, 0);
 }
 
+enum xnn_status xnn_setup_convert_nc_f32_f16(
+  xnn_operator_t convert_op,
+  size_t batch_size,
+  const float* input,
+  void* output,
+  pthreadpool_t threadpool)
+{
+  if (convert_op->type != xnn_operator_type_convert_nc_f32_f16) {
+    xnn_log_error("failed to setup operator: operator type mismatch (expected %s, got %s)",
+      xnn_operator_type_to_string(xnn_operator_type_convert_nc_f32_f16),
+      xnn_operator_type_to_string(convert_op->type));
+    return xnn_status_invalid_parameter;
+  }
+  convert_op->state = xnn_run_state_invalid;
+
+  return setup_unary_elementwise_nc(
+    convert_op,
+    batch_size, input, output,
+    2 /* log2(sizeof(float)) */,
+    1 /* log2(sizeof(uint16_t)) */,
+    NULL, 0);
+}
+
 enum xnn_status xnn_setup_copy_nc_x32(
     xnn_operator_t copy_op,
     size_t batch_size,
@@ -939,4 +977,4 @@
     2 /* log2(sizeof(float)) */,
     2 /* log2(sizeof(float)) */,
     &truncation_op->params.f32_rnd, sizeof(truncation_op->params.f32_rnd));
-}
\ No newline at end of file
+}
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
index 4b96a06..bf23dfd 100644
--- a/src/xnnpack/operator.h
+++ b/src/xnnpack/operator.h
@@ -50,6 +50,7 @@
   xnn_operator_type_constant_pad_nd_x8,
   xnn_operator_type_constant_pad_nd_x32,
   xnn_operator_type_convert_nc_f16_f32,
+  xnn_operator_type_convert_nc_f32_f16,
   xnn_operator_type_convolution_nchw_f32,
   xnn_operator_type_convolution_nhwc_f16,
   xnn_operator_type_convolution_nhwc_f32,
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index 9f14b01..3e297c8 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -2717,6 +2717,7 @@
   } f32;
   struct {
     xnn_univector_ukernel_function f16_to_f32;
+    xnn_univector_ukernel_function f32_to_f16;
   } vcvt;
   struct {
     xnn_unpool_ukernel_function unpool;
diff --git a/test/convert-nc.cc b/test/convert-nc.cc
index aa9d417..7db85cd 100644
--- a/test/convert-nc.cc
+++ b/test/convert-nc.cc
@@ -61,3 +61,57 @@
       .TestF16toF32();
   }
 }
+
+TEST(CONVERT_NC_F32_F16, unit_batch) {
+  for (size_t channels = 1; channels < 100; channels++) {
+    ConvertOperatorTester()
+        .batch_size(1)
+        .channels(channels)
+        .iterations(3)
+        .TestF32toF16();
+  }
+}
+
+TEST(CONVERT_NC_F32_F16, small_batch) {
+  for (size_t channels = 1; channels < 100; channels++) {
+    ConvertOperatorTester()
+        .batch_size(3)
+        .channels(channels)
+        .iterations(3)
+        .TestF32toF16();
+  }
+}
+
+TEST(CONVERT_NC_F32_F16, small_batch_with_input_stride) {
+  for (size_t channels = 1; channels < 100; channels += 15) {
+    ConvertOperatorTester()
+        .batch_size(3)
+        .channels(channels)
+        .input_stride(129)
+        .iterations(3)
+        .TestF32toF16();
+  }
+}
+
+TEST(CONVERT_NC_F32_F16, small_batch_with_output_stride) {
+  for (size_t channels = 1; channels < 100; channels += 15) {
+    ConvertOperatorTester()
+        .batch_size(3)
+        .channels(channels)
+        .output_stride(117)
+        .iterations(3)
+        .TestF32toF16();
+  }
+}
+
+TEST(CONVERT_NC_F32_F16, small_batch_with_input_and_output_stride) {
+  for (size_t channels = 1; channels < 100; channels += 15) {
+    ConvertOperatorTester()
+        .batch_size(3)
+        .channels(channels)
+        .input_stride(129)
+        .output_stride(117)
+        .iterations(3)
+        .TestF32toF16();
+  }
+}
diff --git a/test/convert-operator-tester.h b/test/convert-operator-tester.h
index ff33e17..b2addd4 100644
--- a/test/convert-operator-tester.h
+++ b/test/convert-operator-tester.h
@@ -114,7 +114,7 @@
       ASSERT_NE(nullptr, convert_op);
 
       // Smart pointer to automatically delete convert op.
-      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_ceiling_op(convert_op, xnn_delete_operator);
+      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convert_op(convert_op, xnn_delete_operator);
 
       ASSERT_EQ(xnn_status_success,
         xnn_setup_convert_nc_f16_f32(
@@ -136,6 +136,59 @@
     }
   }
 
+  void TestF32toF16() const {
+    std::random_device random_device;
+    auto rng = std::mt19937(random_device());
+    auto f32rng = std::bind(std::uniform_real_distribution<float>(-1.0f, 1.0f), rng);
+
+    std::vector<float> input(XNN_EXTRA_BYTES / sizeof(float) +
+      (batch_size() - 1) * input_stride() + channels());
+    std::vector<uint16_t> output((batch_size() - 1) * output_stride() + channels());
+    std::vector<uint16_t> output_ref(batch_size() * channels());
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      std::generate(input.begin(), input.end(), std::ref(f32rng));
+      std::fill(output.begin(), output.end(), UINT16_C(0x7E));
+
+      // Compute reference results.
+      for (size_t i = 0; i < batch_size(); i++) {
+        for (size_t c = 0; c < channels(); c++) {
+          output_ref[i * channels() + c] = fp16_ieee_from_fp32_value(input[i * input_stride() + c]);
+        }
+      }
+
+      // Create, setup, run, and destroy Convert operator.
+      ASSERT_EQ(xnn_status_success, xnn_initialize(nullptr /* allocator */));
+      xnn_operator_t convert_op = nullptr;
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_create_convert_nc_f32_f16(
+          channels(), input_stride(), output_stride(),
+          0, &convert_op));
+      ASSERT_NE(nullptr, convert_op);
+
+      // Smart pointer to automatically delete convert op.
+      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_convert_op(convert_op, xnn_delete_operator);
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_setup_convert_nc_f32_f16(
+          convert_op,
+          batch_size(),
+          input.data(), output.data(),
+          nullptr /* thread pool */));
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_run_operator(convert_op, nullptr /* thread pool */));
+
+      // Verify results.
+      for (size_t i = 0; i < batch_size(); i++) {
+        for (size_t c = 0; c < channels(); c++) {
+          ASSERT_EQ(output_ref[i * channels() + c], output[i * output_stride() + c])
+            << "at batch " << i << " / " << batch_size() << ", channel " << c << " / " << channels();
+        }
+      }
+    }
+  }
+
  private:
   size_t batch_size_{1};
   size_t channels_{1};