ND Divide operator with broadcasting support

PiperOrigin-RevId: 284269401
diff --git a/src/binary-elementwise-nd.c b/src/binary-elementwise-nd.c
index 959bce7..c3161b6 100644
--- a/src/binary-elementwise-nd.c
+++ b/src/binary-elementwise-nd.c
@@ -28,7 +28,7 @@
   enum xnn_status status = xnn_status_uninitialized;
 
   if (!xnn_params.initialized) {
-    xnn_log_error("failed to create Add/Subtract/Multiply operator: XNNPACK is not initialized");
+    xnn_log_error("failed to create Add/Subtract/Multiply/Divide/Minimum/Maximum operator: XNNPACK is not initialized");
     goto error;
   }
 
@@ -36,19 +36,19 @@
 
   if (isnan(output_min)) {
     xnn_log_error(
-      "failed to create Add/Subtract/Multiply operator with NaN output lower bound: lower bound must be non-NaN");
+      "failed to create Add/Subtract/Multiply/Divide/Minimum/Maximum operator with NaN output lower bound: lower bound must be non-NaN");
     goto error;
   }
 
   if (isnan(output_max)) {
     xnn_log_error(
-      "failed to create Add/Subtract/Multiply operator with NaN output upper bound: upper bound must be non-NaN");
+      "failed to create Add/Subtract/Multiply/Divide/Minimum/Maximum operator with NaN output upper bound: upper bound must be non-NaN");
     goto error;
   }
 
   if (output_min >= output_max) {
     xnn_log_error(
-      "failed to create Add/Subtract/Multiply operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
+      "failed to create Add/Subtract/Multiply/Divide/Minimum/Maximum operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
       output_min, output_max);
     goto error;
   }
@@ -57,7 +57,7 @@
 
   binary_elementwise_op = xnn_allocate_zero_simd_memory(sizeof(struct xnn_operator));
   if (binary_elementwise_op == NULL) {
-    xnn_log_error("failed to allocate %zu bytes for Add/Subtract/Multiply operator descriptor", sizeof(struct xnn_operator));
+    xnn_log_error("failed to allocate %zu bytes for Add/Subtract/Multiply/Divide/Minimum/Maximum operator descriptor", sizeof(struct xnn_operator));
     goto error;
   }
 
@@ -86,6 +86,16 @@
     output_min, output_max, flags, xnn_operator_type_add_nd_f32, add_op_out);
 }
 
+enum xnn_status xnn_create_divide_nd_f32(  // creates an ND F32 Divide operator through the shared binary-elementwise creator
+    float output_min,
+    float output_max,
+    uint32_t flags,
+    xnn_operator_t* divide_op_out)  // forwarded unchanged as the creator's last argument
+{
+  return create_binary_elementwise_nd_f32(  // same call shape as the Add variant; only the operator type tag differs
+    output_min, output_max, flags, xnn_operator_type_divide_nd_f32, divide_op_out);
+}
+
 enum xnn_status xnn_create_maximum_nd_f32(
     uint32_t flags,
     xnn_operator_t* maximum_op_out)
@@ -138,19 +148,19 @@
     size_t num_threads)
 {
   if (binary_elementwise_op->type != expected_operator_type) {
-    xnn_log_error("failed to setup Add/Subtract/Multiply (ND, F32) operator: operator type mismatch");
+    xnn_log_error("failed to setup Add/Subtract/Multiply/Divide/Minimum/Maximum (ND, F32) operator: operator type mismatch");
     return xnn_status_invalid_parameter;
   }
   binary_elementwise_op->state = xnn_run_state_invalid;
 
   if (!xnn_params.initialized) {
-    xnn_log_error("failed to setup Add/Subtract/Multiply operator: XNNPACK is not initialized");
+    xnn_log_error("failed to setup Add/Subtract/Multiply/Divide/Minimum/Maximum operator: XNNPACK is not initialized");
     return xnn_status_uninitialized;
   }
 
   if (max(num_input1_dims, num_input2_dims) > XNN_MAX_TENSOR_DIMS) {
     xnn_log_error(
-      "failed to setup Add/Subtract/Multiply operator with %zu and %zu dimensions in input shapes: "
+      "failed to setup Add/Subtract/Multiply/Divide/Minimum/Maximum operator with %zu and %zu dimensions in input shapes: "
       "the number of input dimensions must not exceed %d",
       num_input1_dims, num_input2_dims, XNN_MAX_TENSOR_DIMS);
     return xnn_status_unsupported_parameter;
@@ -158,14 +168,14 @@
 
   for (size_t i = 0; i < num_input1_dims; i++) {
     if (input1_shape[i] == 0) {
-      xnn_log_error("failed to setup Add/Subtract/Multiply operator: shape dimension #%zu of input #1 is zero", i);
+      xnn_log_error("failed to setup Add/Subtract/Multiply/Divide/Minimum/Maximum operator: shape dimension #%zu of input #1 is zero", i);
       return xnn_status_invalid_parameter;
     }
   }
 
   for (size_t i = 0; i < num_input2_dims; i++) {
     if (input2_shape[i] == 0) {
-      xnn_log_error("failed to setup Add/Subtract/Multiply operator: shape dimension #%zu of input #2 is zero", i);
+      xnn_log_error("failed to setup Add/Subtract/Multiply/Divide/Minimum/Maximum operator: shape dimension #%zu of input #2 is zero", i);
       return xnn_status_invalid_parameter;
     }
   }
@@ -217,7 +227,7 @@
       compressed_input2_shape[num_compressed_dims - 1] *= input1_dim;
       compressed_output_shape[num_compressed_dims - 1] *= input1_dim;
     } else {
-      xnn_log_error("failed to setup Add/Subtract/Multiply operator: "
+      xnn_log_error("failed to setup Add/Subtract/Multiply/Divide/Minimum/Maximum operator: "
         "shape dimension #%zu of input1 (%zu) does not match shape dimension #%zu of input2 (%zu)",
         num_input1_dims - i, input1_dim, num_input2_dims - i, input2_dim);
       return xnn_status_invalid_parameter;
@@ -313,6 +323,26 @@
     pthreadpool_get_threads_count(threadpool));
 }
 
+enum xnn_status xnn_setup_divide_nd_f32(  // sets up a Divide operator by delegating to the shared ND F32 setup routine
+    xnn_operator_t divide_op,
+    size_t num_input1_dims,
+    const size_t* input1_shape,
+    size_t num_input2_dims,
+    const size_t* input2_shape,
+    const float* input1,
+    const float* input2,
+    float* output,
+    pthreadpool_t threadpool)
+{
+  return setup_binary_elementwise_nd_f32(
+    divide_op, xnn_operator_type_divide_nd_f32,  // type tag the shared setup is told to expect for this operator
+    num_input1_dims, input1_shape,
+    num_input2_dims, input2_shape,
+    input1, input2, output,
+    &xnn_params.f32.vdiv,  // divide micro-kernel table; filled in by the src/init.c part of this patch
+    pthreadpool_get_threads_count(threadpool));  // thread count is queried from the supplied pool
+}
+
 enum xnn_status xnn_setup_maximum_nd_f32(
     xnn_operator_t maximum_op,
     size_t num_input1_dims,
diff --git a/src/init.c b/src/init.c
index 36bbbed..c0f17bf 100644
--- a/src/init.c
+++ b/src/init.c
@@ -224,6 +224,12 @@
       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__neon_x8,
       .element_tile = 8,
     };
+    xnn_params.f32.vdiv = (struct vbinary_parameters) {  // NOTE(review): scalar kernels here while neighbouring entries use NEON - presumably this target lacks a vector divide; confirm
+      .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__scalar_x2,
+      .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__scalar_x2,
+      .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__scalar_x2,
+      .element_tile = 2,
+    };
     xnn_params.f32.vmax = (struct vbinary_parameters) {
       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
@@ -522,6 +528,12 @@
       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__neon_x8,
       .element_tile = 8,
     };
+    xnn_params.f32.vdiv = (struct vbinary_parameters) {
+      .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__neon_x8,
+      .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__neon_x8,
+      .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__neon_x8,
+      .element_tile = 8,
+    };
     xnn_params.f32.vmax = (struct vbinary_parameters) {
       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__neon_x8,
       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__neon_x8,
@@ -848,6 +860,12 @@
       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__sse_x8,
       .element_tile = 8,
     };
+    xnn_params.f32.vdiv = (struct vbinary_parameters) {
+      .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__sse_x8,
+      .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__sse_x8,
+      .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__sse_x8,
+      .element_tile = 8,
+    };
     xnn_params.f32.vmax = (struct vbinary_parameters) {
       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__sse_x8,
       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__sse_x8,
@@ -1072,6 +1090,12 @@
       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__psimd_x8,
       .element_tile = 8,
     };
+    xnn_params.f32.vdiv = (struct vbinary_parameters) {  // psimd divide kernels, 4 elements per tile
+      .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__psimd_x4,
+      .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__psimd_x4,
+      .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__psimd_x4,  // reversed-operand kernel: division is not commutative, so vdivc cannot be reused here
+      .element_tile = 4,
+    };
     xnn_params.f32.vmax = (struct vbinary_parameters) {
       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__psimd_x8,
       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__psimd_x8,
@@ -1271,6 +1295,12 @@
       .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasm_x4,
       .element_tile = 8,
     };
+    xnn_params.f32.vdiv = (struct vbinary_parameters) {
+      .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_ukernel__wasm_x2,
+      .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdivc_ukernel__wasm_x2,
+      .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrdivc_ukernel__wasm_x2,
+      .element_tile = 2,
+    };
     xnn_params.f32.vmax = (struct vbinary_parameters) {
       .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasm_x4,
       .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x4,
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
index fed1165..9ff2564 100644
--- a/src/xnnpack/operator.h
+++ b/src/xnnpack/operator.h
@@ -62,6 +62,7 @@
   xnn_operator_type_convolution_nchw_f32,
   xnn_operator_type_deconvolution_nhwc_f32,
   xnn_operator_type_deconvolution_nhwc_q8,
+  xnn_operator_type_divide_nd_f32,
   xnn_operator_type_fully_connected_nc_f32,
   xnn_operator_type_fully_connected_nc_q8,
   xnn_operator_type_global_average_pooling_nwc_f32,
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index a047d62..8de5adc 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -1363,6 +1363,7 @@
     xnn_univector_ukernel_function sigmoid;
     struct prelu_parameters prelu;
     struct vbinary_parameters vadd;
+    struct vbinary_parameters vdiv;
     struct vbinary_parameters vmax;
     struct vbinary_parameters vmin;
     struct vbinary_parameters vmul;