ND elementwise Multiply operator with broadcasting support
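
Adds xnn_create_multiply_nd_f32 / xnn_setup_multiply_nd_f32: an F32 Multiply
operator that accepts two inputs with up to XNN_MAX_TENSOR_DIMS (4) dimensions,
aligns their shapes on trailing dimensions, and broadcasts dimensions of size 1.
During setup, consecutive dimensions that are broadcast the same way are
collapsed, and the result is dispatched to the vmul, vmulc, or reversed vmulc
micro-kernel depending on which input (if any) is constant along the innermost
compressed dimension. For example, multiplying a [3, 2, 5] tensor by a [2, 5]
tensor reduces to 3 vmul calls over contiguous runs of 10 elements, with the
second input re-read for each run.

Minimal usage sketch of the new API (hypothetical shapes and values; most error
handling omitted; XNNPACK must be built and linked):

  #include <math.h>    // INFINITY, used here to disable output clamping
  #include <stddef.h>
  #include <stdio.h>
  #include <xnnpack.h>

  int main(void) {
    // Multiply a 2x3 matrix by a length-3 row; the row is broadcast across
    // the leading dimension. Inputs are over-allocated by XNN_EXTRA_BYTES
    // because XNNPACK may read (but never writes) past the end of arrays.
    const size_t a_shape[2] = {2, 3};
    const size_t b_shape[1] = {3};
    const float a[6 + XNN_EXTRA_BYTES / sizeof(float)] =
      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
    const float b[3 + XNN_EXTRA_BYTES / sizeof(float)] = {0.5f, 1.0f, 2.0f};
    float y[6];

    if (xnn_initialize() != xnn_status_success) return 1;

    xnn_operator_t multiply_op = NULL;
    xnn_create_multiply_nd_f32(-INFINITY, +INFINITY, 0 /* flags */, &multiply_op);
    xnn_setup_multiply_nd_f32(
      multiply_op,
      2, a_shape, 1, b_shape,
      a, b, y,
      NULL /* no thread pool */);
    xnn_run_operator(multiply_op, NULL /* no thread pool */);
    xnn_delete_operator(multiply_op);

    printf("y[0] = %f\n", y[0]);  // 1.0 * 0.5 = 0.5
    return 0;
  }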

PiperOrigin-RevId: 280801113
diff --git a/BUILD.bazel b/BUILD.bazel
index 7420006..ba9477a 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -65,6 +65,7 @@
     "src/hardswish.c",
     "src/leaky-relu.c",
     "src/max-pooling.c",
+    "src/multiply.c",
     "src/prelu.c",
     "src/resize-bilinear.c",
     "src/sigmoid.c",
@@ -1927,6 +1928,15 @@
 )
 
 xnnpack_unit_test(
+    name = "multiply_test",
+    srcs = [
+        "test/multiply.cc",
+        "test/multiply-operator-tester.h",
+    ],
+    deps = OPERATOR_TEST_DEPS,
+)
+
+xnnpack_unit_test(
     name = "prelu_test",
     srcs = [
         "test/prelu.cc",
diff --git a/include/xnnpack.h b/include/xnnpack.h
index 887f05d..112b208 100644
--- a/include/xnnpack.h
+++ b/include/xnnpack.h
@@ -24,6 +24,9 @@
 /// Note: XNNPACK reads, but never writes beyond array bounds.
 #define XNN_EXTRA_BYTES 16
 
+/// Maximum number of dimensions in tensor shape.
+#define XNN_MAX_TENSOR_DIMS 4
+
 /// The convolution operator represents a depthwise convolution, and use HWGo layout for filters.
 #define XNN_FLAG_DEPTHWISE_CONVOLUTION 0x00000001
 
@@ -311,6 +314,23 @@
     float* output,
     pthreadpool_t threadpool);
 
+/// Create a Multiply operator for F32 tensors with up to XNN_MAX_TENSOR_DIMS dimensions.
+/// Output values are clamped to the [output_min, output_max] range.
+enum xnn_status xnn_create_multiply_nd_f32(
+    float output_min,
+    float output_max,
+    uint32_t flags,
+    xnn_operator_t* multiply_op_out);
+
+/// Setup a Multiply operator with the given input shapes and data pointers.
+/// Input shapes are aligned on their trailing dimensions; dimensions of size 1 are broadcast.
+enum xnn_status xnn_setup_multiply_nd_f32(
+    xnn_operator_t multiply_op,
+    size_t num_input1_dims,
+    const size_t* input1_shape,
+    size_t num_input2_dims,
+    const size_t* input2_shape,
+    const float* input1,
+    const float* input2,
+    float* output,
+    pthreadpool_t threadpool);
+
 enum xnn_status xnn_create_prelu_nc_f32(
     size_t channels,
     size_t input_stride,
diff --git a/src/init.c b/src/init.c
index 9ff1fa9..72fecb9 100644
--- a/src/init.c
+++ b/src/init.c
@@ -206,6 +206,12 @@
       .channel_tile = 8,
     };
     xnn_params.f32.vadd = (xnn_vadd_ukernel_function) xnn_f32_vadd_ukernel__neon_x8;
+    xnn_params.f32.vmul = (struct vbinop_parameters) {
+      .op_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmul_ukernel__neon_x8,
+      .opc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
+      .ropc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
+      .element_tile = 8,
+    };
     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c4__neon_2x,
       .channel_tile = 4,
@@ -463,6 +469,12 @@
       .channel_tile = 8,
     };
     xnn_params.f32.vadd = (xnn_vadd_ukernel_function) xnn_f32_vadd_ukernel__neon_x8;
+    xnn_params.f32.vmul = (struct vbinop_parameters) {
+      .op_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmul_ukernel__neon_x8,
+      .opc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
+      .ropc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__neon_x8,
+      .element_tile = 8,
+    };
     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c4__neonfma_2x,
       .channel_tile = 4,
@@ -674,6 +686,12 @@
       .channel_tile = 8,
     };
     xnn_params.f32.vadd = (xnn_vadd_ukernel_function) xnn_f32_vadd_ukernel__sse_x8;
+    xnn_params.f32.vmul = (struct vbinop_parameters) {
+      .op_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmul_ukernel__sse_x8,
+      .opc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__sse_x8,
+      .ropc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__sse_x8,
+      .element_tile = 8,
+    };
     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c4__sse_2x,
       .channel_tile = 4,
@@ -869,6 +887,12 @@
       .channel_tile = 8,
     };
     xnn_params.f32.vadd = (xnn_vadd_ukernel_function) xnn_f32_vadd_ukernel__psimd_x8;
+    xnn_params.f32.vmul = (struct vbinop_parameters) {
+      .op_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmul_ukernel__psimd_x8,
+      .opc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__psimd_x8,
+      .ropc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__psimd_x8,
+      .element_tile = 8,
+    };
     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c4__psimd_2x,
       .channel_tile = 4,
@@ -1039,6 +1063,12 @@
       .channel_tile = 4,
     };
     xnn_params.f32.vadd = (xnn_vadd_ukernel_function) xnn_f32_vadd_ukernel__scalar_x4;
+    xnn_params.f32.vmul = (struct vbinop_parameters) {
+      .op_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmul_ukernel__scalar_x4,
+      .opc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__scalar_x4,
+      .ropc_ukernel = (xnn_vbinop_ukernel_function) xnn_f32_vmulc_ukernel__scalar_x4,
+      .element_tile = 4,
+    };
     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
       .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c1__scalar_2x,
       .channel_tile = 1,
diff --git a/src/multiply.c b/src/multiply.c
new file mode 100644
index 0000000..a49646c
--- /dev/null
+++ b/src/multiply.c
@@ -0,0 +1,241 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <xnnpack.h>
+#include <xnnpack/allocator.h>
+#include <xnnpack/log.h>
+#include <xnnpack/math.h>
+#include <xnnpack/operator.h>
+#include <xnnpack/params-init.h>
+#include <xnnpack/params.h>
+
+
+enum xnn_status xnn_create_multiply_nd_f32(
+    float output_min,
+    float output_max,
+    uint32_t flags,
+    xnn_operator_t* multiply_op_out)
+{
+  xnn_operator_t multiply_op = NULL;
+  enum xnn_status status = xnn_status_uninitialized;
+
+  if (!xnn_params.initialized) {
+    xnn_log_error("failed to create Multiply operator: XNNPACK is not initialized");
+    goto error;
+  }
+
+  status = xnn_status_invalid_parameter;
+
+  if (isnan(output_min)) {
+    xnn_log_error(
+      "failed to create Multiply operator with NaN output lower bound: lower bound must be non-NaN");
+    goto error;
+  }
+
+  if (isnan(output_max)) {
+    xnn_log_error(
+      "failed to create Multiply operator with NaN output upper bound: upper bound must be non-NaN");
+    goto error;
+  }
+
+  if (output_min >= output_max) {
+    xnn_log_error(
+      "failed to create Multiply operator with [%.7g, %.7g] output range: lower bound must be below upper bound",
+      output_min, output_max);
+    goto error;
+  }
+
+  status = xnn_status_out_of_memory;
+
+  multiply_op = xnn_allocate_zero_memory(sizeof(struct xnn_operator));
+  if (multiply_op == NULL) {
+    xnn_log_error("failed to allocate %zu bytes for Multiply operator descriptor", sizeof(struct xnn_operator));
+    goto error;
+  }
+
+  multiply_op->f32_output_params = xnn_init_f32_output_params(output_min, output_max);
+
+  multiply_op->type = xnn_operator_type_multiply_f32;
+  multiply_op->ukernel.type = xnn_ukernel_type_multiply;
+
+  multiply_op->state = xnn_run_state_invalid;
+
+  *multiply_op_out = multiply_op;
+  return xnn_status_success;
+
+error:
+  xnn_delete_operator(multiply_op);
+  return status;
+}
+
+enum xnn_status xnn_setup_multiply_nd_f32(
+    xnn_operator_t multiply_op,
+    size_t num_input1_dims,
+    const size_t* input1_shape,
+    size_t num_input2_dims,
+    const size_t* input2_shape,
+    const float* input1,
+    const float* input2,
+    float* output,
+    pthreadpool_t threadpool)
+{
+  if (multiply_op->type != xnn_operator_type_multiply_f32) {
+    xnn_log_error("failed to setup Multiply (F32) operator: operator type mismatch");
+    return xnn_status_invalid_parameter;
+  }
+  multiply_op->state = xnn_run_state_invalid;
+
+  if (!xnn_params.initialized) {
+    xnn_log_error("failed to setup Multiply operator: XNNPACK is not initialized");
+    return xnn_status_uninitialized;
+  }
+
+  if (max(num_input1_dims, num_input2_dims) > XNN_MAX_TENSOR_DIMS) {
+    xnn_log_error(
+      "failed to setup Multiply operator with %zu and %zu dimensions in input shapes: "
+      "the number of input dimensions must not exceed 4",
+      num_input1_dims, num_input2_dims);
+    return xnn_status_unsupported_parameter;
+  }
+
+  for (size_t i = 0; i < num_input1_dims; i++) {
+    if (input1_shape[i] == 0) {
+      xnn_log_error("failed to setup Multiply operator: shape dimension #%zu of input #1 is zero", i);
+      return xnn_status_invalid_parameter;
+    }
+  }
+
+  for (size_t i = 0; i < num_input2_dims; i++) {
+    if (input2_shape[i] == 0) {
+      xnn_log_error("failed to setup Multiply operator: shape dimension #%zu of input #2 is zero", i);
+      return xnn_status_invalid_parameter;
+    }
+  }
+
+  size_t num_compressed_dims = 0;
+  size_t compressed_input1_shape[XNN_MAX_TENSOR_DIMS];
+  size_t compressed_input2_shape[XNN_MAX_TENSOR_DIMS];
+  size_t compressed_output_shape[XNN_MAX_TENSOR_DIMS];
+  for (size_t i = 0; i < XNN_MAX_TENSOR_DIMS; i++) {
+    compressed_input1_shape[i] = 1;
+    compressed_input2_shape[i] = 1;
+    compressed_output_shape[i] = 1;
+  }
+  bool broadcast_input1 = false;
+  bool broadcast_input2 = false;
+  bool first_nonunit = true;
+  const size_t num_common_dims = min(num_input1_dims, num_input2_dims);
+  for (size_t i = 1; i <= num_common_dims; i++) {
+    const size_t input1_dim = input1_shape[num_input1_dims - i];
+    const size_t input2_dim = input2_shape[num_input2_dims - i];
+    if (input1_dim == 1 && input2_dim == 1) {
+      continue;
+    }
+    assert(!broadcast_input1 || !broadcast_input2);
+
+    if (input1_dim == 1) {
+      if (!broadcast_input1) {
+        broadcast_input1 = true;
+        broadcast_input2 = false;
+        num_compressed_dims++;
+      }
+      compressed_input2_shape[num_compressed_dims - 1] *= input2_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input2_dim;
+    } else if (input2_dim == 1) {
+      if (!broadcast_input2) {
+        broadcast_input1 = false;
+        broadcast_input2 = true;
+        num_compressed_dims++;
+      }
+      compressed_input1_shape[num_compressed_dims - 1] *= input1_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input1_dim;
+    } else if (input1_dim == input2_dim) {
+      if (broadcast_input1 || broadcast_input2 || first_nonunit) {
+        broadcast_input1 = false;
+        broadcast_input2 = false;
+        num_compressed_dims++;
+      }
+      compressed_input1_shape[num_compressed_dims - 1] *= input1_dim;
+      compressed_input2_shape[num_compressed_dims - 1] *= input1_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input1_dim;
+    } else {
+      xnn_log_error("failed to setup Multiply operator: "
+        "shape dimension #%zu of input1 (%zu) does not match shape dimension #%zu of input2 (%zu)",
+        num_input1_dims - i, input1_dim, num_input2_dims - i, input2_dim);
+      return xnn_status_invalid_parameter;
+    }
+    first_nonunit = false;
+  }
+  if (num_input1_dims > num_input2_dims) {
+    if (!broadcast_input2) {
+      num_compressed_dims++;
+    }
+    for (size_t i = 0; i < num_input1_dims - num_input2_dims; i++) {
+      const size_t input1_dim = input1_shape[i];
+      compressed_input1_shape[num_compressed_dims - 1] *= input1_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input1_dim;
+    }
+  } else if (num_input2_dims > num_input1_dims) {
+    if (!broadcast_input1) {
+      num_compressed_dims++;
+    }
+    for (size_t i = 0; i < num_input2_dims - num_input1_dims; i++) {
+      const size_t input2_dim = input2_shape[i];
+      compressed_input2_shape[num_compressed_dims - 1] *= input2_dim;
+      compressed_output_shape[num_compressed_dims - 1] *= input2_dim;
+    }
+  }
+  num_compressed_dims = max(num_compressed_dims, 1);
+
+  multiply_op->context.elementwise_binary = (struct elementwise_binary_context) {
+    .a = input1,
+    .b = input2,
+    .y = output,
+    .elements = compressed_output_shape[0] * sizeof(float),
+    .params.f32 = multiply_op->f32_output_params,
+  };
+  const size_t* compressed_a_shape = compressed_input1_shape;
+  const size_t* compressed_b_shape = compressed_input2_shape;
+  if (compressed_input1_shape[0] == 1) {
+    multiply_op->context.elementwise_binary.ukernel = xnn_params.f32.vmul.ropc_ukernel;
+    multiply_op->context.elementwise_binary.a = input2;
+    multiply_op->context.elementwise_binary.b = input1;
+    compressed_a_shape = compressed_input2_shape;
+    compressed_b_shape = compressed_input1_shape;
+  } else if (compressed_input2_shape[0] == 1) {
+    multiply_op->context.elementwise_binary.ukernel = xnn_params.f32.vmul.opc_ukernel;
+  } else if (compressed_input1_shape[0] == compressed_input2_shape[0]) {
+    multiply_op->context.elementwise_binary.ukernel = xnn_params.f32.vmul.op_ukernel;
+  }
+  size_t a_stride = compressed_a_shape[0], b_stride = compressed_b_shape[0], y_stride = compressed_output_shape[0];
+  for (size_t i = 1; i < num_compressed_dims; i++) {
+    if (compressed_a_shape[i] != 1) {
+      multiply_op->context.elementwise_binary.a_stride[XNN_MAX_TENSOR_DIMS - 1 - i] = a_stride * sizeof(float);
+    }
+    if (compressed_b_shape[i] != 1) {
+      multiply_op->context.elementwise_binary.b_stride[XNN_MAX_TENSOR_DIMS - 1 - i] = b_stride * sizeof(float);
+    }
+    multiply_op->context.elementwise_binary.y_stride[XNN_MAX_TENSOR_DIMS - 1 - i] = y_stride * sizeof(float);
+    a_stride *= compressed_a_shape[i];
+    b_stride *= compressed_b_shape[i];
+    y_stride *= compressed_output_shape[i];
+  }
+
+  multiply_op->compute.type = xnn_parallelization_type_3d_tile_2d;
+  multiply_op->compute.task_3d_tile_2d = (pthreadpool_task_3d_tile_2d_t) xnn_compute_elementwise_binary_3d;
+  multiply_op->compute.range[0] = compressed_output_shape[3];
+  multiply_op->compute.range[1] = compressed_output_shape[2];
+  multiply_op->compute.range[2] = compressed_output_shape[1];
+  multiply_op->compute.tile[0] = 1;
+  multiply_op->compute.tile[1] = 1;
+  multiply_op->state = xnn_run_state_ready;
+
+  return xnn_status_success;
+}
diff --git a/src/operator-run.c b/src/operator-run.c
index 6be0527..b331521 100644
--- a/src/operator-run.c
+++ b/src/operator-run.c
@@ -562,6 +562,23 @@
   context->ukernel(size, a, b, y, &context->params);
 }
 
+void xnn_compute_elementwise_binary_3d(
+    const struct elementwise_binary_context context[restrict static 1],
+    size_t i, size_t j, size_t k,
+    size_t j_range, size_t k_range)
+{
+  assert(j_range == 1);
+  assert(k_range == 1);
+
+  const void* a = (const void*) ((uintptr_t) context->a +
+    i * context->a_stride[0] + j * context->a_stride[1] + k * context->a_stride[2]);
+  const void* b = (const void*) ((uintptr_t) context->b +
+    i * context->b_stride[0] + j * context->b_stride[1] + k * context->b_stride[2]);
+  void* y = (void*) ((uintptr_t) context->y +
+    i * context->y_stride[0] + j * context->y_stride[1] + k * context->y_stride[2]);
+  context->ukernel(context->elements, a, b, y, &context->params);
+}
+
 void xnn_compute_channel_shuffle_fixed(
     const struct channel_shuffle_context context[restrict static 1],
     size_t index)
diff --git a/src/xnnpack/compute.h b/src/xnnpack/compute.h
index 3397153..c87b143 100644
--- a/src/xnnpack/compute.h
+++ b/src/xnnpack/compute.h
@@ -569,6 +569,27 @@
       size_t size);
 #endif
 
+struct elementwise_binary_context {
+  const void* a;
+  size_t a_stride[XNN_MAX_TENSOR_DIMS - 1];
+  const void* b;
+  size_t b_stride[XNN_MAX_TENSOR_DIMS - 1];
+  void* y;
+  size_t y_stride[XNN_MAX_TENSOR_DIMS - 1];
+  size_t elements;
+  union {
+    union xnn_q8_add_params q8;
+    union xnn_f32_output_params f32;
+  } params;
+  xnn_vbinop_ukernel_function ukernel;
+};
+
+#ifndef __cplusplus
+  XNN_PRIVATE void xnn_compute_elementwise_binary_3d(
+      const struct elementwise_binary_context context[restrict static 1],
+      size_t i, size_t j, size_t k, size_t j_range, size_t k_range);
+#endif
+
 struct channel_shuffle_context {
   const void* x;
   size_t x_stride;
diff --git a/src/xnnpack/operator.h b/src/xnnpack/operator.h
index 6f3c6fa..ffe9e3a 100644
--- a/src/xnnpack/operator.h
+++ b/src/xnnpack/operator.h
@@ -32,6 +32,7 @@
   xnn_ukernel_type_igemm,
   xnn_ukernel_type_lut,
   xnn_ukernel_type_max_pooling,
+  xnn_ukernel_type_multiply,
   xnn_ukernel_type_pad,
   xnn_ukernel_type_pixelwise_average_pooling,
   xnn_ukernel_type_prelu,
@@ -69,6 +70,7 @@
   xnn_operator_type_leaky_relu_q8,
   xnn_operator_type_max_pooling_f32,
   xnn_operator_type_max_pooling_u8,
+  xnn_operator_type_multiply_f32,
   xnn_operator_type_prelu_f32,
   xnn_operator_type_resize_bilinear_f32,
   xnn_operator_type_sigmoid_f32,
@@ -256,6 +258,7 @@
     struct dconv2d_context dconv2d;
     struct dwconv2d_context dwconv2d;
     struct dwconv_context dwconv;
+    struct elementwise_binary_context elementwise_binary;
     struct gemm_context gemm;
     struct global_average_pooling_context global_average_pooling;
     struct global_average_pooling_spnchw_context global_average_pooling_spnchw;
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index b40feef..028e3ba 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -1187,6 +1187,15 @@
   uint8_t log2_sr;
 };
 
+struct vbinop_parameters {
+  xnn_vbinop_ukernel_function op_ukernel;
+  xnn_vbinop_ukernel_function opc_ukernel;
+  xnn_vbinop_ukernel_function ropc_ukernel;
+  // Number of elements in a tile.
+  // For best efficiency, micro-kernel must process a multiple of this number of elements in each call.
+  uint8_t element_tile;
+};
+
 struct spmm_parameters {
   xnn_spmm_ukernel_function ukernel;
   // Number of M-dimension elements in a tile.
@@ -1345,6 +1354,7 @@
     xnn_univector_ukernel_function sigmoid;
     struct prelu_parameters prelu;
     xnn_vadd_ukernel_function vadd;
+    struct vbinop_parameters vmul;
     struct vmulcaddc_parameters vmulcaddc;
     // Sparse Matrix-Dense Matrix Multiplication (NR=1 block).
     struct spmm_parameters spmm;
diff --git a/test/multiply-operator-tester.h b/test/multiply-operator-tester.h
new file mode 100644
index 0000000..dec141d
--- /dev/null
+++ b/test/multiply-operator-tester.h
@@ -0,0 +1,216 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <initializer_list>
+#include <limits>
+#include <memory>
+#include <numeric>
+#include <random>
+#include <vector>
+
+#include <xnnpack.h>
+
+
+class MultiplyOperatorTester {
+ public:
+  inline MultiplyOperatorTester& input1_shape(std::initializer_list<size_t> input1_shape) {
+    assert(input1_shape.size() <= XNN_MAX_TENSOR_DIMS);
+    this->input1_shape_ = std::vector<size_t>(input1_shape);
+    return *this;
+  }
+
+  inline const std::vector<size_t>& input1_shape() const {
+    return this->input1_shape_;
+  }
+
+  inline size_t input1_dim(size_t i) const {
+    return i < num_input1_dims() ? this->input1_shape_[i] : 1;
+  }
+
+  inline size_t num_input1_dims() const {
+    return this->input1_shape_.size();
+  }
+
+  inline size_t num_input1_elements() const {
+    return std::accumulate(
+      this->input1_shape_.begin(), this->input1_shape_.end(), size_t(1), std::multiplies<size_t>());
+  }
+
+  inline MultiplyOperatorTester& input2_shape(std::initializer_list<size_t> input2_shape) {
+    assert(input2_shape.size() <= XNN_MAX_TENSOR_DIMS);
+    this->input2_shape_ = std::vector<size_t>(input2_shape);
+    return *this;
+  }
+
+  inline const std::vector<size_t>& input2_shape() const {
+    return this->input2_shape_;
+  }
+
+  inline size_t input2_dim(size_t i) const {
+    return i < num_input2_dims() ? this->input2_shape_[i] : 1;
+  }
+
+  inline size_t num_input2_dims() const {
+    return this->input2_shape_.size();
+  }
+
+  inline size_t num_input2_elements() const {
+    return std::accumulate(
+      this->input2_shape_.begin(), this->input2_shape_.end(), size_t(1), std::multiplies<size_t>());
+  }
+
+  inline MultiplyOperatorTester& qmin(uint8_t qmin) {
+    this->qmin_ = qmin;
+    return *this;
+  }
+
+  inline uint8_t qmin() const {
+    return this->qmin_;
+  }
+
+  inline MultiplyOperatorTester& qmax(uint8_t qmax) {
+    this->qmax_ = qmax;
+    return *this;
+  }
+
+  inline uint8_t qmax() const {
+    return this->qmax_;
+  }
+
+  inline MultiplyOperatorTester& iterations(size_t iterations) {
+    this->iterations_ = iterations;
+    return *this;
+  }
+
+  inline size_t iterations() const {
+    return this->iterations_;
+  }
+
+  void TestF32() const {
+    std::random_device random_device;
+    auto rng = std::mt19937(random_device());
+    auto f32rng = std::bind(std::uniform_real_distribution<float>(0.0f, 1.0f), rng);
+
+    // Compute generalized shapes.
+    std::array<size_t, XNN_MAX_TENSOR_DIMS> input1_dims;
+    std::array<size_t, XNN_MAX_TENSOR_DIMS> input2_dims;
+    std::array<size_t, XNN_MAX_TENSOR_DIMS> output_dims;
+    std::fill(input1_dims.begin(), input1_dims.end(), 1);
+    std::fill(input2_dims.begin(), input2_dims.end(), 1);
+    std::fill(output_dims.begin(), output_dims.end(), 1);
+    std::copy(input1_shape().cbegin(), input1_shape().cend(), input1_dims.end() - num_input1_dims());
+    std::copy(input2_shape().cbegin(), input2_shape().cend(), input2_dims.end() - num_input2_dims());
+    for (size_t i = 0; i < XNN_MAX_TENSOR_DIMS; i++) {
+      if (input1_dims[i] != 1 && input2_dims[i] != 1) {
+        ASSERT_EQ(input1_dims[i], input2_dims[i]);
+      }
+      output_dims[i] = std::max(input1_dims[i], input2_dims[i]);
+    }
+    const size_t num_output_elements =
+      std::accumulate(output_dims.begin(), output_dims.end(), size_t(1), std::multiplies<size_t>());
+
+    // Compute generalized strides.
+    std::array<size_t, XNN_MAX_TENSOR_DIMS> input1_strides;
+    std::array<size_t, XNN_MAX_TENSOR_DIMS> input2_strides;
+    std::array<size_t, XNN_MAX_TENSOR_DIMS> output_strides;
+    size_t input1_stride = 1, input2_stride = 1, output_stride = 1;
+    for (size_t i = XNN_MAX_TENSOR_DIMS; i != 0; i--) {
+      input1_strides[i - 1] = input1_dims[i - 1] == 1 ? 0 : input1_stride;
+      input2_strides[i - 1] = input2_dims[i - 1] == 1 ? 0 : input2_stride;
+      output_strides[i - 1] = output_stride;
+      input1_stride *= input1_dims[i - 1];
+      input2_stride *= input2_dims[i - 1];
+      output_stride *= output_dims[i - 1];
+    }
+
+    std::vector<float> input1(XNN_EXTRA_BYTES / sizeof(float) + num_input1_elements());
+    std::vector<float> input2(XNN_EXTRA_BYTES / sizeof(float) + num_input2_elements());
+    std::vector<float> output(num_output_elements);
+    std::vector<float> output_ref(num_output_elements);
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      std::generate(input1.begin(), input1.end(), std::ref(f32rng));
+      std::generate(input2.begin(), input2.end(), std::ref(f32rng));
+      std::fill(output.begin(), output.end(), nanf(""));
+
+      // Compute reference results.
+      for (size_t i = 0; i < output_dims[0]; i++) {
+        for (size_t j = 0; j < output_dims[1]; j++) {
+          for (size_t k = 0; k < output_dims[2]; k++) {
+            for (size_t l = 0; l < output_dims[3]; l++) {
+              output_ref[i * output_strides[0] + j * output_strides[1] + k * output_strides[2] + l * output_strides[3]] =
+                input1[i * input1_strides[0] + j * input1_strides[1] + k * input1_strides[2] + l * input1_strides[3]] *
+                input2[i * input2_strides[0] + j * input2_strides[1] + k * input2_strides[2] + l * input2_strides[3]];
+            }
+          }
+        }
+      }
+      const float accumulated_min = *std::min_element(output_ref.cbegin(), output_ref.cend());
+      const float accumulated_max = *std::max_element(output_ref.cbegin(), output_ref.cend());
+      const float accumulated_range = accumulated_max - accumulated_min;
+      const float output_min = num_output_elements == 1 ?
+        -std::numeric_limits<float>::infinity() : accumulated_min + accumulated_range / 255.0f * float(qmin());
+      const float output_max = num_output_elements == 1 ?
+        +std::numeric_limits<float>::infinity() : accumulated_max - accumulated_range / 255.0f * float(255 - qmax());
+      for (float& output_value : output_ref) {
+        output_value = std::min(std::max(output_value, output_min), output_max);
+      }
+
+      // Create, setup, run, and destroy Multiply operator.
+      ASSERT_EQ(xnn_status_success, xnn_initialize());
+      xnn_operator_t multiply_op = nullptr;
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_create_multiply_nd_f32(
+          output_min, output_max,
+          0, &multiply_op));
+      ASSERT_NE(nullptr, multiply_op);
+
+      // Smart pointer to automatically delete multiply_op.
+      std::unique_ptr<xnn_operator, decltype(&xnn_delete_operator)> auto_multiply_op(multiply_op, xnn_delete_operator);
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_setup_multiply_nd_f32(
+          multiply_op,
+          num_input1_dims(),
+          input1_shape().data(),
+          num_input2_dims(),
+          input2_shape().data(),
+          input1.data(), input2.data(), output.data(),
+          nullptr /* thread pool */));
+
+      ASSERT_EQ(xnn_status_success,
+        xnn_run_operator(multiply_op, nullptr /* thread pool */));
+
+      // Verify results.
+      for (size_t i = 0; i < output_dims[0]; i++) {
+        for (size_t j = 0; j < output_dims[1]; j++) {
+          for (size_t k = 0; k < output_dims[2]; k++) {
+            for (size_t l = 0; l < output_dims[3]; l++) {
+              const size_t index =
+                i * output_strides[0] + j * output_strides[1] + k * output_strides[2] + l * output_strides[3];
+              ASSERT_NEAR(output[index], output_ref[index], 1.0e-6f * std::abs(output_ref[index]))
+                << "(i, j, k, l) = (" << i << ", " << j << ", " << k << ", " << l << ")";
+            }
+          }
+        }
+      }
+    }
+  }
+
+ private:
+  std::vector<size_t> input1_shape_;
+  std::vector<size_t> input2_shape_;
+  uint8_t qmin_{0};
+  uint8_t qmax_{255};
+  size_t iterations_{5};
+};
diff --git a/test/multiply.cc b/test/multiply.cc
new file mode 100644
index 0000000..2c7bb72
--- /dev/null
+++ b/test/multiply.cc
@@ -0,0 +1,125 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <gtest/gtest.h>
+
+#include "multiply-operator-tester.h"
+
+
+TEST(MULTIPLY_OP_F32, vector_x_vector) {
+  MultiplyOperatorTester()
+    .input1_shape({2})
+    .input2_shape({2})
+    .iterations(1)
+    .TestF32();
+}
+
+TEST(MULTIPLY_OP_F32, vector_x_scalar) {
+  MultiplyOperatorTester()
+    .input1_shape({2})
+    .iterations(1)
+    .TestF32();
+  MultiplyOperatorTester()
+    .input1_shape({2})
+    .input2_shape({1})
+    .iterations(1)
+    .TestF32();
+}
+
+TEST(MULTIPLY_OP_F32, scalar_x_vector) {
+  MultiplyOperatorTester()
+    .input2_shape({2})
+    .iterations(1)
+    .TestF32();
+  MultiplyOperatorTester()
+    .input1_shape({1})
+    .input2_shape({2})
+    .iterations(1)
+    .TestF32();
+}
+
+TEST(MULTIPLY_OP_F32, matrix_x_matrix) {
+  MultiplyOperatorTester()
+    .input1_shape({2, 3})
+    .input2_shape({2, 3})
+    .iterations(1)
+    .TestF32();
+}
+
+TEST(MULTIPLY_OP_F32, matrix_x_row) {
+  MultiplyOperatorTester()
+    .input1_shape({2, 3})
+    .input2_shape({1, 3})
+    .iterations(1)
+    .TestF32();
+  MultiplyOperatorTester()
+    .input1_shape({2, 3})
+    .input2_shape({3})
+    .iterations(1)
+    .TestF32();
+}
+
+TEST(MULTIPLY_OP_F32, matrix_x_column) {
+  MultiplyOperatorTester()
+    .input1_shape({2, 3})
+    .input2_shape({2, 1})
+    .iterations(1)
+    .TestF32();
+}
+
+TEST(MULTIPLY_OP_F32, matrix_x_scalar) {
+  MultiplyOperatorTester()
+    .input1_shape({2, 3})
+    .input2_shape({1, 1})
+    .iterations(1)
+    .TestF32();
+  MultiplyOperatorTester()
+    .input1_shape({2, 3})
+    .input2_shape({1})
+    .iterations(1)
+    .TestF32();
+  MultiplyOperatorTester()
+    .input1_shape({2, 3})
+    .iterations(1)
+    .TestF32();
+}
+
+TEST(MULTIPLY_OP_F32, row_x_matrix) {
+  MultiplyOperatorTester()
+    .input1_shape({1, 3})
+    .input2_shape({2, 3})
+    .iterations(1)
+    .TestF32();
+  MultiplyOperatorTester()
+    .input1_shape({3})
+    .input2_shape({2, 3})
+    .iterations(1)
+    .TestF32();
+}
+
+TEST(MULTIPLY_OP_F32, column_x_matrix) {
+  MultiplyOperatorTester()
+    .input1_shape({2, 1})
+    .input2_shape({2, 3})
+    .iterations(1)
+    .TestF32();
+}
+
+TEST(MULTIPLY_OP_F32, scalar_x_matrix) {
+  MultiplyOperatorTester()
+    .input1_shape({1, 1})
+    .input2_shape({2, 3})
+    .iterations(1)
+    .TestF32();
+  MultiplyOperatorTester()
+    .input1_shape({1})
+    .input2_shape({2, 3})
+    .iterations(1)
+    .TestF32();
+  MultiplyOperatorTester()
+    .input2_shape({2, 3})
+    .iterations(1)
+    .TestF32();
+}