Add binary op microkernels with RELU activation

PiperOrigin-RevId: 325607697
diff --git a/src/f32-vbinary/gen/vadd-relu-scalar-x1.c b/src/f32-vbinary/gen/vadd-relu-scalar-x1.c
new file mode 100644
index 0000000..d701332
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_relu_ukernel__scalar_x1(  // Computes a[i] + b[i], then math_max_f32(., 0.0f) (the ReLU step); one float per iteration.
+    size_t n,  // length in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read sequentially
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= sizeof(float); n -= sizeof(float)) {  // n counts down in bytes
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va + vb;
+    vy = math_max_f32(vy, 0.0f);  // ReLU step: floor the sum at zero
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-scalar-x2.c b/src/f32-vbinary/gen/vadd-relu-scalar-x2.c
new file mode 100644
index 0000000..8feca13
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-scalar-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_relu_ukernel__scalar_x2(  // Computes a[i] + b[i], then math_max_f32(., 0.0f) (the ReLU step); two floats per main-loop iteration.
+    size_t n,  // length in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read sequentially
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {  // main loop: pairs of floats
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+
+
+    vy0 = math_max_f32(vy0, 0.0f);  // ReLU step: floor each sum at zero
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // n is a multiple of sizeof(float) and below two floats here, so exactly one element is left
+    const float va = *a;
+    const float vb = *b;
+    float vy = va + vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-scalar-x4.c b/src/f32-vbinary/gen/vadd-relu-scalar-x4.c
new file mode 100644
index 0000000..11131c0
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-scalar-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_relu_ukernel__scalar_x4(  // Computes a[i] + b[i], then math_max_f32(., 0.0f) (the ReLU step); four floats per main-loop iteration.
+    size_t n,  // length in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read sequentially
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: groups of four floats
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+    float vy2 = va2 + vb2;
+    float vy3 = va3 + vb3;
+
+
+    vy0 = math_max_f32(vy0, 0.0f);  // ReLU step: floor each sum at zero
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // one to three floats are left over
+    do {  // handle the leftovers one float at a time
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va + vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-wasm-x1.c b/src/f32-vbinary/gen/vadd-relu-wasm-x1.c
new file mode 100644
index 0000000..45bbe36
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_relu_ukernel__wasm_x1(  // Computes a[i] + b[i], then __builtin_wasm_max_f32(., 0.0f) (the ReLU step); one float per iteration.
+    size_t n,  // length in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read sequentially
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= sizeof(float); n -= sizeof(float)) {  // n counts down in bytes
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va + vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);  // ReLU step through the compiler's WebAssembly max builtin
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-wasm-x2.c b/src/f32-vbinary/gen/vadd-relu-wasm-x2.c
new file mode 100644
index 0000000..ab69037
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-wasm-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_relu_ukernel__wasm_x2(  // Computes a[i] + b[i], then __builtin_wasm_max_f32(., 0.0f) (the ReLU step); two floats per main-loop iteration.
+    size_t n,  // length in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read sequentially
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {  // main loop: pairs of floats
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);  // ReLU step through the compiler's WebAssembly max builtin
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // n is a multiple of sizeof(float) and below two floats here, so exactly one element is left
+    const float va = *a;
+    const float vb = *b;
+    float vy = va + vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-wasm-x4.c b/src/f32-vbinary/gen/vadd-relu-wasm-x4.c
new file mode 100644
index 0000000..fd776af
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-wasm-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_relu_ukernel__wasm_x4(  // Computes a[i] + b[i], then __builtin_wasm_max_f32(., 0.0f) (the ReLU step); four floats per main-loop iteration.
+    size_t n,  // length in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read sequentially
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: groups of four floats
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+    float vy2 = va2 + vb2;
+    float vy3 = va3 + vb3;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);  // ReLU step through the compiler's WebAssembly max builtin
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // one to three floats are left over
+    do {  // handle the leftovers one float at a time
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va + vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vadd-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..2710ce0
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-wasmsimd-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_relu_ukernel__wasmsimd_x4(  // Adds a[i] + b[i] four lanes at a time with WebAssembly SIMD, then floors the result at zero (ReLU).
+    size_t n,  // length in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read sequentially
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);  // four lanes of 0.0f (all-zero bits), used as the ReLU floor
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: one 128-bit vector (four floats) per iteration
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb0123);
+
+
+    vy0123 = wasm_i32x4_max(vy0123, vzero);  // signed-integer max on the float bit patterns: lanes with the sign bit set compare below 0 and become +0.0f. NOTE(review): NaNs are handled by sign bit only -- confirm that is acceptable
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // one to three floats are left over
+    const v128_t va = wasm_v128_load(a);  // NOTE(review): full 16-byte load although fewer than four floats remain, so it reads past the n valid bytes; presumably callers guarantee this is readable -- confirm
+    const v128_t vb = wasm_v128_load(b);  // same full-width load on the second input
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);  // same integer-max ReLU as in the main loop
+
+    if (n & (2 * sizeof(float))) {  // at least two floats left
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // store lanes 0-1 as a single 8-byte write
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // move lanes 2-3 down into lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {  // an odd float is left
+      *y = wasm_f32x4_extract_lane(vy, 0);  // store the final single float
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vadd-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..10d58bb
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-wasmsimd-x8.c
@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_relu_ukernel__wasmsimd_x8(  // Adds a[i] + b[i] eight floats (two vectors) at a time with WebAssembly SIMD, then floors the result at zero (ReLU).
+    size_t n,  // length in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read sequentially
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);  // four lanes of 0.0f (all-zero bits), used as the ReLU floor
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {  // main loop: two 128-bit vectors (eight floats) per iteration
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    b += 8;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_add(va4567, vb4567);
+
+
+    vy0123 = wasm_i32x4_max(vy0123, vzero);  // signed-integer max on the float bit patterns: lanes with the sign bit set compare below 0 and become +0.0f. NOTE(review): NaNs are handled by sign bit only -- confirm that is acceptable
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // fewer than eight floats remain here, so this runs at most once
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);  // same integer-max ReLU as in the main loop
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // one to three floats are left over
+    const v128_t va = wasm_v128_load(a);  // NOTE(review): full 16-byte load although fewer than four floats remain, so it reads past the n valid bytes; presumably callers guarantee this is readable -- confirm
+    const v128_t vb = wasm_v128_load(b);  // same full-width load on the second input
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);  // same integer-max ReLU as in the main loop
+
+    if (n & (2 * sizeof(float))) {  // at least two floats left
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // store lanes 0-1 as a single 8-byte write
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // move lanes 2-3 down into lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {  // an odd float is left
+      *y = wasm_f32x4_extract_lane(vy, 0);  // store the final single float
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-scalar-x1.c b/src/f32-vbinary/gen/vaddc-relu-scalar-x1.c
new file mode 100644
index 0000000..309398e
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vaddc_relu_ukernel__scalar_x1(  // Computes a[i] + *b (one shared addend), then math_max_f32(., 0.0f) (the ReLU step); one float per iteration.
+    size_t n,  // length of a and y in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // vector input, read sequentially
+    const float* b,  // pointer to a single addend: only *b is read, once
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;  // the shared addend, loaded once before the loop
+  for (; n >= sizeof(float); n -= sizeof(float)) {  // n counts down in bytes
+    const float va = *a++;
+    float vy = va + vb;
+    vy = math_max_f32(vy, 0.0f);  // ReLU step: floor the sum at zero
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-scalar-x2.c b/src/f32-vbinary/gen/vaddc-relu-scalar-x2.c
new file mode 100644
index 0000000..2247e79
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-scalar-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vaddc_relu_ukernel__scalar_x2(  // Computes a[i] + *b (one shared addend), then math_max_f32(., 0.0f) (the ReLU step); two floats per main-loop iteration.
+    size_t n,  // length of a and y in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // vector input, read sequentially
+    const float* b,  // pointer to a single addend: only *b is read, once
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;  // the shared addend, loaded once before the loop
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {  // main loop: pairs of floats
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+
+
+    vy0 = math_max_f32(vy0, 0.0f);  // ReLU step: floor each sum at zero
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // n is a multiple of sizeof(float) and below two floats here, so exactly one element is left
+    const float va = *a;
+    float vy = va + vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-scalar-x4.c b/src/f32-vbinary/gen/vaddc-relu-scalar-x4.c
new file mode 100644
index 0000000..36ea948
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-scalar-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vaddc_relu_ukernel__scalar_x4(  // Computes a[i] + *b (one shared addend), then math_max_f32(., 0.0f) (the ReLU step); four floats per main-loop iteration.
+    size_t n,  // length of a and y in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // vector input, read sequentially
+    const float* b,  // pointer to a single addend: only *b is read, once
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;  // the shared addend, loaded once before the loop
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: groups of four floats
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+    float vy2 = va2 + vb;
+    float vy3 = va3 + vb;
+
+
+    vy0 = math_max_f32(vy0, 0.0f);  // ReLU step: floor each sum at zero
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // one to three floats are left over
+    do {  // handle the leftovers one float at a time
+      const float va = *a++;
+      float vy = va + vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-wasm-x1.c b/src/f32-vbinary/gen/vaddc-relu-wasm-x1.c
new file mode 100644
index 0000000..de88639
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vaddc_relu_ukernel__wasm_x1(  // Computes a[i] + *b (one shared addend), then __builtin_wasm_max_f32(., 0.0f) (the ReLU step); one float per iteration.
+    size_t n,  // length of a and y in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // vector input, read sequentially
+    const float* b,  // pointer to a single addend: only *b is read, once
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;  // the shared addend, loaded once before the loop
+  for (; n >= sizeof(float); n -= sizeof(float)) {  // n counts down in bytes
+    const float va = *a++;
+    float vy = va + vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);  // ReLU step through the compiler's WebAssembly max builtin
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-wasm-x2.c b/src/f32-vbinary/gen/vaddc-relu-wasm-x2.c
new file mode 100644
index 0000000..638d8c5
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-wasm-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vaddc_relu_ukernel__wasm_x2(  // Computes a[i] + *b (one shared addend), then __builtin_wasm_max_f32(., 0.0f) (the ReLU step); two floats per main-loop iteration.
+    size_t n,  // length of a and y in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // vector input, read sequentially
+    const float* b,  // pointer to a single addend: only *b is read, once
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;  // the shared addend, loaded once before the loop
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {  // main loop: pairs of floats
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);  // ReLU step through the compiler's WebAssembly max builtin
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // n is a multiple of sizeof(float) and below two floats here, so exactly one element is left
+    const float va = *a;
+    float vy = va + vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-wasm-x4.c b/src/f32-vbinary/gen/vaddc-relu-wasm-x4.c
new file mode 100644
index 0000000..275891b
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-wasm-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vaddc_relu_ukernel__wasm_x4(  // Computes a[i] + *b (one shared addend), then __builtin_wasm_max_f32(., 0.0f) (the ReLU step); four floats per main-loop iteration.
+    size_t n,  // length of a and y in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // vector input, read sequentially
+    const float* b,  // pointer to a single addend: only *b is read, once
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;  // the shared addend, loaded once before the loop
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: groups of four floats
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+    float vy2 = va2 + vb;
+    float vy3 = va3 + vb;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);  // ReLU step through the compiler's WebAssembly max builtin
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // one to three floats are left over
+    do {  // handle the leftovers one float at a time
+      const float va = *a++;
+      float vy = va + vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..e1f7fdf
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x4.c
@@ -0,0 +1,61 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vaddc_relu_ukernel__wasmsimd_x4(  // Adds the single value *b to a[i] four lanes at a time with WebAssembly SIMD, then floors the result at zero (ReLU).
+    size_t n,  // length of a and y in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // vector input, read sequentially
+    const float* b,  // pointer to a single addend: only the float at b is loaded, once
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);  // four lanes of 0.0f (all-zero bits), used as the ReLU floor
+  const v128_t vb = wasm_v32x4_load_splat(b);  // load the 32-bit value at b and replicate it into all four lanes
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: one 128-bit vector (four floats) per iteration
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb);
+
+
+    vy0123 = wasm_i32x4_max(vy0123, vzero);  // signed-integer max on the float bit patterns: lanes with the sign bit set compare below 0 and become +0.0f. NOTE(review): NaNs are handled by sign bit only -- confirm that is acceptable
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // one to three floats are left over
+    const v128_t va = wasm_v128_load(a);  // NOTE(review): full 16-byte load although fewer than four floats remain, so it reads past the n valid bytes; presumably callers guarantee this is readable -- confirm
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);  // same integer-max ReLU as in the main loop
+
+    if (n & (2 * sizeof(float))) {  // at least two floats left
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // store lanes 0-1 as a single 8-byte write
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // move lanes 2-3 down into lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {  // an odd float is left
+      *y = wasm_f32x4_extract_lane(vy, 0);  // store the final single float
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..eb52c12
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x8.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vaddc_relu_ukernel__wasmsimd_x8(  // Adds the single value *b to a[i] eight floats (two vectors) at a time with WebAssembly SIMD, then floors the result at zero (ReLU).
+    size_t n,  // length of a and y in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // vector input, read sequentially
+    const float* b,  // pointer to a single addend: only the float at b is loaded, once
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);  // four lanes of 0.0f (all-zero bits), used as the ReLU floor
+  const v128_t vb = wasm_v32x4_load_splat(b);  // load the 32-bit value at b and replicate it into all four lanes
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {  // main loop: two 128-bit vectors (eight floats) per iteration
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_add(va4567, vb);
+
+
+    vy0123 = wasm_i32x4_max(vy0123, vzero);  // signed-integer max on the float bit patterns: lanes with the sign bit set compare below 0 and become +0.0f. NOTE(review): NaNs are handled by sign bit only -- confirm that is acceptable
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // fewer than eight floats remain here, so this runs at most once
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);  // same integer-max ReLU as in the main loop
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // one to three floats are left over
+    const v128_t va = wasm_v128_load(a);  // NOTE(review): full 16-byte load although fewer than four floats remain, so it reads past the n valid bytes; presumably callers guarantee this is readable -- confirm
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);  // same integer-max ReLU as in the main loop
+
+    if (n & (2 * sizeof(float))) {  // at least two floats left
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // store lanes 0-1 as a single 8-byte write
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // move lanes 2-3 down into lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {  // an odd float is left
+      *y = wasm_f32x4_extract_lane(vy, 0);  // store the final single float
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-scalar-x1.c b/src/f32-vbinary/gen/vdiv-relu-scalar-x1.c
new file mode 100644
index 0000000..fe1c875
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_relu_ukernel__scalar_x1(  // Computes a[i] / b[i], then math_max_f32(., 0.0f) (the ReLU step); one float per iteration.
+    size_t n,  // length in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // dividends, read sequentially
+    const float* b,  // divisors, read sequentially; values are not checked for zero
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= sizeof(float); n -= sizeof(float)) {  // n counts down in bytes
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va / vb;
+    vy = math_max_f32(vy, 0.0f);  // ReLU step: floor the quotient at zero
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-scalar-x2.c b/src/f32-vbinary/gen/vdiv-relu-scalar-x2.c
new file mode 100644
index 0000000..ce988e4
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-scalar-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_relu_ukernel__scalar_x2(  // Computes a[i] / b[i], then math_max_f32(., 0.0f) (the ReLU step); two floats per main-loop iteration.
+    size_t n,  // length in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // dividends, read sequentially
+    const float* b,  // divisors, read sequentially; values are not checked for zero
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {  // main loop: pairs of floats
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 / vb0;
+    float vy1 = va1 / vb1;
+
+
+    vy0 = math_max_f32(vy0, 0.0f);  // ReLU step: floor each quotient at zero
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // n is a multiple of sizeof(float) and below two floats here, so exactly one element is left
+    const float va = *a;
+    const float vb = *b;
+    float vy = va / vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-scalar-x4.c b/src/f32-vbinary/gen/vdiv-relu-scalar-x4.c
new file mode 100644
index 0000000..db115d3
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-scalar-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_relu_ukernel__scalar_x4(  // Computes a[i] / b[i], then math_max_f32(., 0.0f) (the ReLU step); four floats per main-loop iteration.
+    size_t n,  // length in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // dividends, read sequentially
+    const float* b,  // divisors, read sequentially; values are not checked for zero
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: groups of four floats
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 / vb0;
+    float vy1 = va1 / vb1;
+    float vy2 = va2 / vb2;
+    float vy3 = va3 / vb3;
+
+
+    vy0 = math_max_f32(vy0, 0.0f);  // ReLU step: floor each quotient at zero
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // one to three floats are left over
+    do {  // handle the leftovers one float at a time
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va / vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-wasm-x1.c b/src/f32-vbinary/gen/vdiv-relu-wasm-x1.c
new file mode 100644
index 0000000..94ef79e
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_relu_ukernel__wasm_x1(  // Computes a[i] / b[i], then __builtin_wasm_max_f32(., 0.0f) (the ReLU step); one float per iteration.
+    size_t n,  // length in bytes, not elements (asserted to be a multiple of sizeof(float))
+    const float* a,  // dividends, read sequentially
+    const float* b,  // divisors, read sequentially; values are not checked for zero
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in the body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);  // n must cover a whole number of floats
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= sizeof(float); n -= sizeof(float)) {  // n counts down in bytes
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va / vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);  // ReLU step through the compiler's WebAssembly max builtin
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-wasm-x2.c b/src/f32-vbinary/gen/vdiv-relu-wasm-x2.c
new file mode 100644
index 0000000..4e70db1
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-wasm-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_relu_ukernel__wasm_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores max(a[i] / b[i], 0.0f) to y[i] using the wasm f32 max builtin as the ReLU step,
+  // 2 floats per main-loop pass. n is a size in bytes; params is never read here.
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 / vb0;
+    float vy1 = va1 / vb1;
+
+    // ReLU step: clamp each quotient from below at 0.0f.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, exactly one float is left here.
+    const float va = *a;
+    const float vb = *b;
+    float vy = va / vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-wasm-x4.c b/src/f32-vbinary/gen/vdiv-relu-wasm-x4.c
new file mode 100644
index 0000000..6683ed4
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-wasm-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_relu_ukernel__wasm_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores max(a[i] / b[i], 0.0f) to y[i] using the wasm f32 max builtin as the ReLU step,
+  // 4 floats per main-loop pass. n is a size in bytes; params is never read here.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 / vb0;
+    float vy1 = va1 / vb1;
+    float vy2 = va2 / vb2;
+    float vy3 = va3 / vb3;
+
+    // ReLU step: clamp each quotient from below at 0.0f.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, 1 to 3 floats are left here.
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va / vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..0f6ad66
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_relu_ukernel__wasmsimd_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores a[i] / b[i] to y[i] with sign-bit-set results replaced by +0.0f, 4 floats per vector.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  // n is a size in bytes (see the asserts above); params is never read in this body.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb0123);
+    // ReLU via signed-integer max on the float bit patterns: a lane with its sign bit set
+    // is a negative int32 and is replaced by vzero's all-zero bits, i.e. +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, 1 to 3 floats are left here.
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+    // Both loads fetch a full 16 bytes, i.e. they read past the n bytes that remain in a and b.
+    v128_t vy = wasm_f32x4_div(va, vb);
+    // Same integer-max ReLU as in the main loop.
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the remaining lanes: an optional pair, then an optional single float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // Type-punned 64-bit store of float lanes 0-1.
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // Move lanes 2-3 down to lanes 0-1.
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..95d66a9
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x8.c
@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_relu_ukernel__wasmsimd_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores a[i] / b[i] to y[i] with sign-bit-set results replaced by +0.0f, 8 floats per main pass.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  // n is a size in bytes (see the asserts above); params is never read in this body.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    b += 8;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_div(va4567, vb4567);
+    // ReLU via signed-integer max on the float bit patterns: a lane with its sign bit set
+    // is a negative int32 and is replaced by vzero's all-zero bits, i.e. +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // At most one pass: n < 8 floats here.
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, 1 to 3 floats are left here.
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+    // Both loads fetch a full 16 bytes, i.e. they read past the n bytes that remain in a and b.
+    v128_t vy = wasm_f32x4_div(va, vb);
+    // Same integer-max ReLU as in the main loop.
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the remaining lanes: an optional pair, then an optional single float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // Type-punned 64-bit store of float lanes 0-1.
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // Move lanes 2-3 down to lanes 0-1.
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-scalar-x1.c b/src/f32-vbinary/gen/vdivc-relu-scalar-x1.c
new file mode 100644
index 0000000..73f22a7
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_relu_ukernel__scalar_x1(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores math_max_f32(a[i] / *b, 0.0f) to y[i], one float per iteration; the max against 0.0f
+  // is the ReLU step (helper defined outside this file). n is in bytes; params is never read here.
+  const float vb = *b;  // Single divisor, read once and reused for every element.
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    float vy = va / vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-scalar-x2.c b/src/f32-vbinary/gen/vdivc-relu-scalar-x2.c
new file mode 100644
index 0000000..9f27717
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-scalar-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_relu_ukernel__scalar_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores math_max_f32(a[i] / *b, 0.0f) to y[i], 2 floats per main-loop pass; the max against
+  // 0.0f is the ReLU step (helper defined outside this file). n is in bytes; params is never read.
+  const float vb = *b;  // Single divisor, read once and reused for every element.
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 / vb;
+    float vy1 = va1 / vb;
+
+    // ReLU step applied to each quotient.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, exactly one float is left here.
+    const float va = *a;
+    float vy = va / vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-scalar-x4.c b/src/f32-vbinary/gen/vdivc-relu-scalar-x4.c
new file mode 100644
index 0000000..74bf59b
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-scalar-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_relu_ukernel__scalar_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores math_max_f32(a[i] / *b, 0.0f) to y[i], 4 floats per main-loop pass; the max against
+  // 0.0f is the ReLU step (helper defined outside this file). n is in bytes; params is never read.
+  const float vb = *b;  // Single divisor, read once and reused for every element.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 / vb;
+    float vy1 = va1 / vb;
+    float vy2 = va2 / vb;
+    float vy3 = va3 / vb;
+
+    // ReLU step applied to each quotient.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, 1 to 3 floats are left here.
+    do {
+      const float va = *a++;
+      float vy = va / vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-wasm-x1.c b/src/f32-vbinary/gen/vdivc-relu-wasm-x1.c
new file mode 100644
index 0000000..3c917ae
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_relu_ukernel__wasm_x1(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores max(a[i] / *b, 0.0f) to y[i] using the wasm f32 max builtin as the ReLU step,
+  // one float per iteration. n is a size in bytes; params is never read here.
+  const float vb = *b;  // Single divisor, read once and reused for every element.
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    float vy = va / vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-wasm-x2.c b/src/f32-vbinary/gen/vdivc-relu-wasm-x2.c
new file mode 100644
index 0000000..1f902ae
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-wasm-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_relu_ukernel__wasm_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores max(a[i] / *b, 0.0f) to y[i] using the wasm f32 max builtin as the ReLU step,
+  // 2 floats per main-loop pass. n is a size in bytes; params is never read here.
+  const float vb = *b;  // Single divisor, read once and reused for every element.
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 / vb;
+    float vy1 = va1 / vb;
+
+    // ReLU step: clamp each quotient from below at 0.0f.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, exactly one float is left here.
+    const float va = *a;
+    float vy = va / vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-wasm-x4.c b/src/f32-vbinary/gen/vdivc-relu-wasm-x4.c
new file mode 100644
index 0000000..8e3f484
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-wasm-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_relu_ukernel__wasm_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores max(a[i] / *b, 0.0f) to y[i] using the wasm f32 max builtin as the ReLU step,
+  // 4 floats per main-loop pass. n is a size in bytes; params is never read here.
+  const float vb = *b;  // Single divisor, read once and reused for every element.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 / vb;
+    float vy1 = va1 / vb;
+    float vy2 = va2 / vb;
+    float vy3 = va3 / vb;
+
+    // ReLU step: clamp each quotient from below at 0.0f.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, 1 to 3 floats are left here.
+    do {
+      const float va = *a++;
+      float vy = va / vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..6567e78
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x4.c
@@ -0,0 +1,61 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_relu_ukernel__wasmsimd_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores a[i] / *b to y[i] with sign-bit-set results replaced by +0.0f; n is in bytes, params unread.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);  // Broadcast the single divisor *b to all 4 lanes.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb);
+    // ReLU via signed-integer max on the float bit patterns: a lane with its sign bit set
+    // is a negative int32 and is replaced by vzero's all-zero bits, i.e. +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, 1 to 3 floats are left here.
+    const v128_t va = wasm_v128_load(a);
+    // The load fetches a full 16 bytes, i.e. it reads past the n bytes that remain in a.
+    v128_t vy = wasm_f32x4_div(va, vb);
+    // Same integer-max ReLU as in the main loop.
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the remaining lanes: an optional pair, then an optional single float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // Type-punned 64-bit store of float lanes 0-1.
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // Move lanes 2-3 down to lanes 0-1.
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..ad68670
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x8.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_relu_ukernel__wasmsimd_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores a[i] / *b to y[i] with sign-bit-set results replaced by +0.0f; n is in bytes, params unread.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);  // Broadcast the single divisor *b to all 4 lanes.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_div(va4567, vb);
+    // ReLU via signed-integer max on the float bit patterns: a lane with its sign bit set
+    // is a negative int32 and is replaced by vzero's all-zero bits, i.e. +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // At most one pass: n < 8 floats here.
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, 1 to 3 floats are left here.
+    const v128_t va = wasm_v128_load(a);
+    // The load fetches a full 16 bytes, i.e. it reads past the n bytes that remain in a.
+    v128_t vy = wasm_f32x4_div(va, vb);
+    // Same integer-max ReLU as in the main loop.
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the remaining lanes: an optional pair, then an optional single float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // Type-punned 64-bit store of float lanes 0-1.
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // Move lanes 2-3 down to lanes 0-1.
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-scalar-x1.c b/src/f32-vbinary/gen/vmul-relu-scalar-x1.c
new file mode 100644
index 0000000..29622c6
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmul_relu_ukernel__scalar_x1(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores math_max_f32(a[i] * b[i], 0.0f) to y[i], one float per iteration; the max against 0.0f
+  // is the ReLU step (helper defined outside this file). n is in bytes; params is never read here.
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va * vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-scalar-x2.c b/src/f32-vbinary/gen/vmul-relu-scalar-x2.c
new file mode 100644
index 0000000..48f6bcf
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-scalar-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmul_relu_ukernel__scalar_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores math_max_f32(a[i] * b[i], 0.0f) to y[i], 2 floats per main-loop pass; the max against
+  // 0.0f is the ReLU step (helper defined outside this file). n is in bytes; params is never read.
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+
+    // ReLU step applied to each product.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, exactly one float is left here.
+    const float va = *a;
+    const float vb = *b;
+    float vy = va * vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-scalar-x4.c b/src/f32-vbinary/gen/vmul-relu-scalar-x4.c
new file mode 100644
index 0000000..d0acbb8
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-scalar-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmul_relu_ukernel__scalar_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores math_max_f32(a[i] * b[i], 0.0f) to y[i], 4 floats per main-loop pass; the max against
+  // 0.0f is the ReLU step (helper defined outside this file). n is in bytes; params is never read.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+    float vy2 = va2 * vb2;
+    float vy3 = va3 * vb3;
+
+    // ReLU step applied to each product.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, 1 to 3 floats are left here.
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va * vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-wasm-x1.c b/src/f32-vbinary/gen/vmul-relu-wasm-x1.c
new file mode 100644
index 0000000..2d34fed
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmul_relu_ukernel__wasm_x1(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores max(a[i] * b[i], 0.0f) to y[i] using the wasm f32 max builtin as the ReLU step,
+  // one float per iteration. n is a size in bytes; params is never read here.
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va * vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-wasm-x2.c b/src/f32-vbinary/gen/vmul-relu-wasm-x2.c
new file mode 100644
index 0000000..d6f8677
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-wasm-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmul_relu_ukernel__wasm_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores max(a[i] * b[i], 0.0f) to y[i] using the wasm f32 max builtin as the ReLU step,
+  // 2 floats per main-loop pass. n is a size in bytes; params is never read here.
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+
+    // ReLU step: clamp each product from below at 0.0f.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, exactly one float is left here.
+    const float va = *a;
+    const float vb = *b;
+    float vy = va * vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-wasm-x4.c b/src/f32-vbinary/gen/vmul-relu-wasm-x4.c
new file mode 100644
index 0000000..20471db
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-wasm-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmul_relu_ukernel__wasm_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores max(a[i] * b[i], 0.0f) to y[i] using the wasm f32 max builtin as the ReLU step,
+  // 4 floats per main-loop pass. n is a size in bytes; params is never read here.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+    float vy2 = va2 * vb2;
+    float vy3 = va3 * vb3;
+
+    // ReLU step: clamp each product from below at 0.0f.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, 1 to 3 floats are left here.
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va * vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vmul-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..e3b236a
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-wasmsimd-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmul_relu_ukernel__wasmsimd_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Stores a[i] * b[i] to y[i] with sign-bit-set results replaced by +0.0f, 4 floats per vector.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  // n is a size in bytes (see the asserts above); params is never read in this body.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb0123);
+    // ReLU via signed-integer max on the float bit patterns: a lane with its sign bit set
+    // is a negative int32 and is replaced by vzero's all-zero bits, i.e. +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // Given the asserted n, 1 to 3 floats are left here.
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+    // Both loads fetch a full 16 bytes, i.e. they read past the n bytes that remain in a and b.
+    v128_t vy = wasm_f32x4_mul(va, vb);
+    // Same integer-max ReLU as in the main loop.
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the remaining lanes: an optional pair, then an optional single float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // Type-punned 64-bit store of float lanes 0-1.
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // Move lanes 2-3 down to lanes 0-1.
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vmul-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..2fe6699
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-wasmsimd-x8.c
@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmul_relu_ukernel__wasmsimd_x8(
+    size_t n,        // size in BYTES; asserted nonzero and a multiple of sizeof(float)
+    const float* a,  // first factor
+    const float* b,  // second factor, advanced in lockstep with a
+    float* y,        // output: a[i] * b[i] with the ReLU clamp applied
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // params is not read by this kernel; the clamp bound is the constant below.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  // Main loop: two 128-bit vectors (8 floats) from each input per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    b += 8;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_mul(va4567, vb4567);
+    // ReLU via a SIGNED INTEGER max on the float bit patterns: a lane whose sign
+    // bit is set compares below 0 and becomes all-zero bits; others pass through.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // fewer than 8 floats are left, so this runs at most once
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+    // Both loads above fetch a full 16 bytes although fewer than 4 floats remain.
+    v128_t vy = wasm_f32x4_mul(va, vb);
+    // NOTE(review): presumably callers keep the over-read bytes readable - confirm.
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the valid lanes, selected by the bits of the remaining n.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 in one 8-byte store through a double* cast
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // move lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-scalar-x1.c b/src/f32-vbinary/gen/vmulc-relu-scalar-x1.c
new file mode 100644
index 0000000..26b126d
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmulc_relu_ukernel__scalar_x1(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant factor; n is a byte count, sizeof(float) per element.
+  const float factor = b[0];
+  while (n >= sizeof(float)) {
+    const float product = (*a++) * factor;
+    // ReLU: clamp the product from below at zero.
+    *y++ = math_max_f32(product, 0.0f);
+    n -= sizeof(float);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-scalar-x2.c b/src/f32-vbinary/gen/vmulc-relu-scalar-x2.c
new file mode 100644
index 0000000..0a81915
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-scalar-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmulc_relu_ukernel__scalar_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant factor shared by every element; n counts bytes.
+  const float factor = b[0];
+
+  // Unrolled main loop: two elements per pass, both loaded before any store.
+  while (n >= 2 * sizeof(float)) {
+    const float product0 = a[0] * factor;
+    const float product1 = a[1] * factor;
+    a += 2;
+
+    // ReLU: clamp each product from below at zero.
+    const float relu0 = math_max_f32(product0, 0.0f);
+    const float relu1 = math_max_f32(product1, 0.0f);
+
+    y[0] = relu0;
+    y[1] = relu1;
+    y += 2;
+    n -= 2 * sizeof(float);
+  }
+
+  // Tail: a single element is left over when the element count is odd.
+  if XNN_UNLIKELY(n != 0) {
+    const float product = a[0] * factor;
+    y[0] = math_max_f32(product, 0.0f);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-scalar-x4.c b/src/f32-vbinary/gen/vmulc-relu-scalar-x4.c
new file mode 100644
index 0000000..4e52ae1
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-scalar-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmulc_relu_ukernel__scalar_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant factor shared by every element; n counts bytes.
+  const float factor = b[0];
+
+  // Unrolled main loop: four elements per pass, all loaded before any store.
+  while (n >= 4 * sizeof(float)) {
+    const float product0 = a[0] * factor;
+    const float product1 = a[1] * factor;
+    const float product2 = a[2] * factor;
+    const float product3 = a[3] * factor;
+    a += 4;
+
+    // ReLU: clamp each product from below at zero.
+    const float relu0 = math_max_f32(product0, 0.0f);
+    const float relu1 = math_max_f32(product1, 0.0f);
+    const float relu2 = math_max_f32(product2, 0.0f);
+    const float relu3 = math_max_f32(product3, 0.0f);
+
+    y[0] = relu0;
+    y[1] = relu1;
+    y[2] = relu2;
+    y[3] = relu3;
+    y += 4;
+    n -= 4 * sizeof(float);
+  }
+
+  // Tail: whatever is left after the unrolled loop (fewer than four elements)
+  // is handled one float at a time until the byte count reaches zero.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      // Same multiply-then-clamp step as above, for a single element.
+      const float product = (*a++) * factor;
+      *y++ = math_max_f32(product, 0.0f);
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-wasm-x1.c b/src/f32-vbinary/gen/vmulc-relu-wasm-x1.c
new file mode 100644
index 0000000..b16bbb6
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmulc_relu_ukernel__wasm_x1(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant factor; n is a byte count, sizeof(float) per element.
+  const float factor = b[0];
+  while (n >= sizeof(float)) {
+    const float product = (*a++) * factor;
+    // ReLU: clamp the product from below at zero with the WebAssembly max builtin.
+    *y++ = __builtin_wasm_max_f32(product, 0.0f);
+    n -= sizeof(float);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-wasm-x2.c b/src/f32-vbinary/gen/vmulc-relu-wasm-x2.c
new file mode 100644
index 0000000..ef4d8b6
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-wasm-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmulc_relu_ukernel__wasm_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant factor shared by every element; n counts bytes.
+  const float factor = b[0];
+
+  // Unrolled main loop: two elements per pass, both loaded before any store.
+  while (n >= 2 * sizeof(float)) {
+    const float product0 = a[0] * factor;
+    const float product1 = a[1] * factor;
+    a += 2;
+
+    // ReLU: clamp each product from below at zero with the WebAssembly max builtin.
+    const float relu0 = __builtin_wasm_max_f32(product0, 0.0f);
+    const float relu1 = __builtin_wasm_max_f32(product1, 0.0f);
+
+    y[0] = relu0;
+    y[1] = relu1;
+    y += 2;
+    n -= 2 * sizeof(float);
+  }
+
+  // Tail: a single element is left over when the element count is odd.
+  if XNN_UNLIKELY(n != 0) {
+    const float product = a[0] * factor;
+    y[0] = __builtin_wasm_max_f32(product, 0.0f);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-wasm-x4.c b/src/f32-vbinary/gen/vmulc-relu-wasm-x4.c
new file mode 100644
index 0000000..1167faa
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-wasm-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmulc_relu_ukernel__wasm_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant factor shared by every element; n counts bytes.
+  const float factor = b[0];
+
+  // Unrolled main loop: four elements per pass, all loaded before any store.
+  while (n >= 4 * sizeof(float)) {
+    const float product0 = a[0] * factor;
+    const float product1 = a[1] * factor;
+    const float product2 = a[2] * factor;
+    const float product3 = a[3] * factor;
+    a += 4;
+
+    // ReLU: clamp each product from below at zero with the WebAssembly max builtin.
+    const float relu0 = __builtin_wasm_max_f32(product0, 0.0f);
+    const float relu1 = __builtin_wasm_max_f32(product1, 0.0f);
+    const float relu2 = __builtin_wasm_max_f32(product2, 0.0f);
+    const float relu3 = __builtin_wasm_max_f32(product3, 0.0f);
+
+    y[0] = relu0;
+    y[1] = relu1;
+    y[2] = relu2;
+    y[3] = relu3;
+    y += 4;
+    n -= 4 * sizeof(float);
+  }
+
+  // Tail: whatever is left after the unrolled loop (fewer than four elements)
+  // is handled one float at a time until the byte count reaches zero.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      // Same multiply-then-clamp step as above, for a single element.
+      const float product = (*a++) * factor;
+      *y++ = __builtin_wasm_max_f32(product, 0.0f);
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..54f6fcb
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x4.c
@@ -0,0 +1,61 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmulc_relu_ukernel__wasmsimd_x4(
+    size_t n,        // size in BYTES; asserted nonzero and a multiple of sizeof(float)
+    const float* a,  // vector operand, one float per element
+    const float* b,  // constant operand: only the single float at b is loaded
+    float* y,        // output: a[i] * b[0] with the ReLU clamp applied
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // params is not read by this kernel; the clamp bound is the constant below.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);  // the one float at b, replicated into all 4 lanes
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb);
+    // ReLU via a SIGNED INTEGER max on the float bit patterns: a lane whose sign
+    // bit is set compares below 0 and becomes all-zero bits; others pass through.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    // The load above fetches a full 16 bytes although fewer than 4 floats remain.
+    v128_t vy = wasm_f32x4_mul(va, vb);
+    // NOTE(review): presumably callers keep the over-read bytes readable - confirm.
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the valid lanes, selected by the bits of the remaining n.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 in one 8-byte store through a double* cast
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // move lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..7d10ad8
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x8.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmulc_relu_ukernel__wasmsimd_x8(
+    size_t n,        // size in BYTES; asserted nonzero and a multiple of sizeof(float)
+    const float* a,  // vector operand, one float per element
+    const float* b,  // constant operand: only the single float at b is loaded
+    float* y,        // output: a[i] * b[0] with the ReLU clamp applied
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // params is not read by this kernel; the clamp bound is the constant below.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);  // the one float at b, replicated into all 4 lanes
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_mul(va4567, vb);
+    // ReLU via a SIGNED INTEGER max on the float bit patterns: a lane whose sign
+    // bit is set compares below 0 and becomes all-zero bits; others pass through.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // fewer than 8 floats are left, so this runs at most once
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    // The load above fetches a full 16 bytes although fewer than 4 floats remain.
+    v128_t vy = wasm_f32x4_mul(va, vb);
+    // NOTE(review): presumably callers keep the over-read bytes readable - confirm.
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the valid lanes, selected by the bits of the remaining n.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 in one 8-byte store through a double* cast
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // move lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-scalar-x1.c b/src/f32-vbinary/gen/vrdivc-relu-scalar-x1.c
new file mode 100644
index 0000000..3c8adb2
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrdivc_relu_ukernel__scalar_x1(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant dividend; n is a byte count, sizeof(float) per element.
+  const float dividend = b[0];
+  while (n >= sizeof(float)) {
+    const float quotient = dividend / (*a++);
+    // ReLU: clamp the quotient from below at zero.
+    *y++ = math_max_f32(quotient, 0.0f);
+    n -= sizeof(float);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-scalar-x2.c b/src/f32-vbinary/gen/vrdivc-relu-scalar-x2.c
new file mode 100644
index 0000000..965c307
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-scalar-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrdivc_relu_ukernel__scalar_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant dividend shared by every element; n counts bytes.
+  const float dividend = b[0];
+
+  // Unrolled main loop: two elements per pass, both loaded before any store.
+  while (n >= 2 * sizeof(float)) {
+    const float quotient0 = dividend / a[0];
+    const float quotient1 = dividend / a[1];
+    a += 2;
+
+    // ReLU: clamp each quotient from below at zero.
+    const float relu0 = math_max_f32(quotient0, 0.0f);
+    const float relu1 = math_max_f32(quotient1, 0.0f);
+
+    y[0] = relu0;
+    y[1] = relu1;
+    y += 2;
+    n -= 2 * sizeof(float);
+  }
+
+  // Tail: a single element is left over when the element count is odd.
+  if XNN_UNLIKELY(n != 0) {
+    const float quotient = dividend / a[0];
+    y[0] = math_max_f32(quotient, 0.0f);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-scalar-x4.c b/src/f32-vbinary/gen/vrdivc-relu-scalar-x4.c
new file mode 100644
index 0000000..a685c0f
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-scalar-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrdivc_relu_ukernel__scalar_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant dividend shared by every element; n counts bytes.
+  const float dividend = b[0];
+
+  // Unrolled main loop: four elements per pass, all loaded before any store.
+  while (n >= 4 * sizeof(float)) {
+    const float quotient0 = dividend / a[0];
+    const float quotient1 = dividend / a[1];
+    const float quotient2 = dividend / a[2];
+    const float quotient3 = dividend / a[3];
+    a += 4;
+
+    // ReLU: clamp each quotient from below at zero.
+    const float relu0 = math_max_f32(quotient0, 0.0f);
+    const float relu1 = math_max_f32(quotient1, 0.0f);
+    const float relu2 = math_max_f32(quotient2, 0.0f);
+    const float relu3 = math_max_f32(quotient3, 0.0f);
+
+    y[0] = relu0;
+    y[1] = relu1;
+    y[2] = relu2;
+    y[3] = relu3;
+    y += 4;
+    n -= 4 * sizeof(float);
+  }
+
+  // Tail: whatever is left after the unrolled loop (fewer than four elements)
+  // is handled one float at a time until the byte count reaches zero.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      // Same divide-then-clamp step as above, for a single element.
+      const float quotient = dividend / (*a++);
+      *y++ = math_max_f32(quotient, 0.0f);
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-wasm-x1.c b/src/f32-vbinary/gen/vrdivc-relu-wasm-x1.c
new file mode 100644
index 0000000..a181ad6
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrdivc_relu_ukernel__wasm_x1(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant dividend; n is a byte count, sizeof(float) per element.
+  const float dividend = b[0];
+  while (n >= sizeof(float)) {
+    const float quotient = dividend / (*a++);
+    // ReLU: clamp the quotient from below at zero with the WebAssembly max builtin.
+    *y++ = __builtin_wasm_max_f32(quotient, 0.0f);
+    n -= sizeof(float);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-wasm-x2.c b/src/f32-vbinary/gen/vrdivc-relu-wasm-x2.c
new file mode 100644
index 0000000..8fbd18b
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-wasm-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrdivc_relu_ukernel__wasm_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant dividend shared by every element; n counts bytes.
+  const float dividend = b[0];
+
+  // Unrolled main loop: two elements per pass, both loaded before any store.
+  while (n >= 2 * sizeof(float)) {
+    const float quotient0 = dividend / a[0];
+    const float quotient1 = dividend / a[1];
+    a += 2;
+
+    // ReLU: clamp each quotient from below at zero with the WebAssembly max builtin.
+    const float relu0 = __builtin_wasm_max_f32(quotient0, 0.0f);
+    const float relu1 = __builtin_wasm_max_f32(quotient1, 0.0f);
+
+    y[0] = relu0;
+    y[1] = relu1;
+    y += 2;
+    n -= 2 * sizeof(float);
+  }
+
+  // Tail: a single element is left over when the element count is odd.
+  if XNN_UNLIKELY(n != 0) {
+    const float quotient = dividend / a[0];
+    y[0] = __builtin_wasm_max_f32(quotient, 0.0f);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-wasm-x4.c b/src/f32-vbinary/gen/vrdivc-relu-wasm-x4.c
new file mode 100644
index 0000000..b866a19
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-wasm-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrdivc_relu_ukernel__wasm_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant dividend shared by every element; n counts bytes.
+  const float dividend = b[0];
+
+  // Unrolled main loop: four elements per pass, all loaded before any store.
+  while (n >= 4 * sizeof(float)) {
+    const float quotient0 = dividend / a[0];
+    const float quotient1 = dividend / a[1];
+    const float quotient2 = dividend / a[2];
+    const float quotient3 = dividend / a[3];
+    a += 4;
+
+    // ReLU: clamp each quotient from below at zero with the WebAssembly max builtin.
+    const float relu0 = __builtin_wasm_max_f32(quotient0, 0.0f);
+    const float relu1 = __builtin_wasm_max_f32(quotient1, 0.0f);
+    const float relu2 = __builtin_wasm_max_f32(quotient2, 0.0f);
+    const float relu3 = __builtin_wasm_max_f32(quotient3, 0.0f);
+
+    y[0] = relu0;
+    y[1] = relu1;
+    y[2] = relu2;
+    y[3] = relu3;
+    y += 4;
+    n -= 4 * sizeof(float);
+  }
+
+  // Tail: whatever is left after the unrolled loop (fewer than four elements)
+  // is handled one float at a time until the byte count reaches zero.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      // Same divide-then-clamp step as above, for a single element.
+      const float quotient = dividend / (*a++);
+      *y++ = __builtin_wasm_max_f32(quotient, 0.0f);
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..15ee424
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x4.c
@@ -0,0 +1,61 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrdivc_relu_ukernel__wasmsimd_x4(
+    size_t n,        // size in BYTES; asserted nonzero and a multiple of sizeof(float)
+    const float* a,  // divisors, one float per element
+    const float* b,  // constant dividend: only the single float at b is loaded
+    float* y,        // output: b[0] / a[i] with the ReLU clamp applied
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // params is not read by this kernel; the clamp bound is the constant below.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);  // the one float at b, replicated into all 4 lanes
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy0123 = wasm_f32x4_div(vb, va0123);
+    // ReLU via a SIGNED INTEGER max on the float bit patterns: a lane whose sign
+    // bit is set compares below 0 and becomes all-zero bits; others pass through.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    // The load above fetches a full 16 bytes although fewer than 4 floats remain.
+    v128_t vy = wasm_f32x4_div(vb, va);
+    // NOTE(review): presumably callers keep the over-read bytes readable - confirm.
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the valid lanes, selected by the bits of the remaining n.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 in one 8-byte store through a double* cast
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // move lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..5affb62
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x8.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrdivc_relu_ukernel__wasmsimd_x8(
+    size_t n,        // size in BYTES; asserted nonzero and a multiple of sizeof(float)
+    const float* a,  // divisors, one float per element
+    const float* b,  // constant dividend: only the single float at b is loaded
+    float* y,        // output: b[0] / a[i] with the ReLU clamp applied
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // params is not read by this kernel; the clamp bound is the constant below.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);  // the one float at b, replicated into all 4 lanes
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    v128_t vy0123 = wasm_f32x4_div(vb, va0123);
+    v128_t vy4567 = wasm_f32x4_div(vb, va4567);
+    // ReLU via a SIGNED INTEGER max on the float bit patterns: a lane whose sign
+    // bit is set compares below 0 and becomes all-zero bits; others pass through.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // fewer than 8 floats are left, so this runs at most once
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_div(vb, va);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    // The load above fetches a full 16 bytes although fewer than 4 floats remain.
+    v128_t vy = wasm_f32x4_div(vb, va);
+    // NOTE(review): presumably callers keep the over-read bytes readable - confirm.
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the valid lanes, selected by the bits of the remaining n.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 in one 8-byte store through a double* cast
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // move lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-scalar-x1.c b/src/f32-vbinary/gen/vrsubc-relu-scalar-x1.c
new file mode 100644
index 0000000..0ab4b6e
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrsubc_relu_ukernel__scalar_x1(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant minuend; n is a byte count, sizeof(float) per element.
+  const float minuend = b[0];
+  while (n >= sizeof(float)) {
+    const float difference = minuend - (*a++);
+    // ReLU: clamp the difference from below at zero.
+    *y++ = math_max_f32(difference, 0.0f);
+    n -= sizeof(float);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-scalar-x2.c b/src/f32-vbinary/gen/vrsubc-relu-scalar-x2.c
new file mode 100644
index 0000000..faf11a2
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-scalar-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrsubc_relu_ukernel__scalar_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant minuend shared by every element; n counts bytes.
+  const float minuend = b[0];
+
+  // Unrolled main loop: two elements per pass, both loaded before any store.
+  while (n >= 2 * sizeof(float)) {
+    const float difference0 = minuend - a[0];
+    const float difference1 = minuend - a[1];
+    a += 2;
+
+    // ReLU: clamp each difference from below at zero.
+    const float relu0 = math_max_f32(difference0, 0.0f);
+    const float relu1 = math_max_f32(difference1, 0.0f);
+
+    y[0] = relu0;
+    y[1] = relu1;
+    y += 2;
+    n -= 2 * sizeof(float);
+  }
+
+  // Tail: a single element is left over when the element count is odd.
+  if XNN_UNLIKELY(n != 0) {
+    const float difference = minuend - a[0];
+    y[0] = math_max_f32(difference, 0.0f);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-scalar-x4.c b/src/f32-vbinary/gen/vrsubc-relu-scalar-x4.c
new file mode 100644
index 0000000..8408572
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-scalar-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrsubc_relu_ukernel__scalar_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // b supplies one constant minuend shared by every element; n counts bytes.
+  const float minuend = b[0];
+
+  // Unrolled main loop: four elements per pass, all loaded before any store.
+  while (n >= 4 * sizeof(float)) {
+    const float difference0 = minuend - a[0];
+    const float difference1 = minuend - a[1];
+    const float difference2 = minuend - a[2];
+    const float difference3 = minuend - a[3];
+    a += 4;
+
+    // ReLU: clamp each difference from below at zero.
+    const float relu0 = math_max_f32(difference0, 0.0f);
+    const float relu1 = math_max_f32(difference1, 0.0f);
+    const float relu2 = math_max_f32(difference2, 0.0f);
+    const float relu3 = math_max_f32(difference3, 0.0f);
+
+    y[0] = relu0;
+    y[1] = relu1;
+    y[2] = relu2;
+    y[3] = relu3;
+    y += 4;
+    n -= 4 * sizeof(float);
+  }
+
+  // Tail: whatever is left after the unrolled loop (fewer than four elements)
+  // is handled one float at a time until the byte count reaches zero.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      // Same subtract-then-clamp step as above, for a single element.
+      const float difference = minuend - (*a++);
+      *y++ = math_max_f32(difference, 0.0f);
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-wasm-x1.c b/src/f32-vbinary/gen/vrsubc-relu-wasm-x1.c
new file mode 100644
index 0000000..37e921f
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused reversed subtract-by-constant: y[i] = __builtin_wasm_max_f32(*b - a[i], 0.0f), one float per pass.
+void xnn_f32_vrsubc_relu_ukernel__wasm_x1(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // vector operand, one float read per output
+    const float* b,  // only *b is read
+    float* y,  // output, one float written per element of a
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar operand is loaded once and reused for every element.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {  // n shrinks by one float's worth of bytes per pass
+    const float va = *a++;
+    float vy = vb - va;  // reversed operand order: constant minus element
+    vy = __builtin_wasm_max_f32(vy, 0.0f);  // ReLU step via the compiler's WebAssembly f32 max builtin
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-wasm-x2.c b/src/f32-vbinary/gen/vrsubc-relu-wasm-x2.c
new file mode 100644
index 0000000..e873111
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-wasm-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused reversed subtract-by-constant: y[i] = __builtin_wasm_max_f32(*b - a[i], 0.0f), two floats per main-loop pass.
+void xnn_f32_vrsubc_relu_ukernel__wasm_x2(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // vector operand, one float read per output
+    const float* b,  // only *b is read
+    float* y,  // output, one float written per element of a
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar operand is loaded once and reused for every element.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {  // main loop: 2 elements per pass
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = vb - va0;  // reversed operand order: constant minus element
+    float vy1 = vb - va1;
+
+    // ReLU step via the compiler's WebAssembly f32 max builtin.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // exactly one element is left here, so no pointer advance is needed
+    const float va = *a;
+    float vy = vb - va;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-wasm-x4.c b/src/f32-vbinary/gen/vrsubc-relu-wasm-x4.c
new file mode 100644
index 0000000..0aa0d11
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-wasm-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused reversed subtract-by-constant: y[i] = __builtin_wasm_max_f32(*b - a[i], 0.0f), four floats per main-loop pass.
+void xnn_f32_vrsubc_relu_ukernel__wasm_x4(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // vector operand, one float read per output
+    const float* b,  // only *b is read
+    float* y,  // output, one float written per element of a
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar operand is loaded once and reused for every element.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: 4 elements per pass
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = vb - va0;  // reversed operand order: constant minus element
+    float vy1 = vb - va1;
+    float vy2 = vb - va2;
+    float vy3 = vb - va3;
+
+    // ReLU step via the compiler's WebAssembly f32 max builtin.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {  // remainder: the 1-3 leftover elements, one per pass
+      const float va = *a++;
+      float vy = vb - va;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..4ea3d1a
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x4.c
@@ -0,0 +1,61 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused reversed subtract-by-constant on 128-bit WebAssembly SIMD vectors: y[i] is *b - a[i], with negative results replaced by zero.
+void xnn_f32_vrsubc_relu_ukernel__wasmsimd_x4(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // vector operand, loaded four floats at a time
+    const float* b,  // only the first float is read (splat load)
+    float* y,  // output, one float written per element of a
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);  // all-zero vector used as the ReLU bound
+  const v128_t vb = wasm_v32x4_load_splat(b);  // *b replicated into all four 32-bit lanes
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: one full 4-float vector per pass
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy0123 = wasm_f32x4_sub(vb, va0123);  // reversed operand order: constant minus element
+
+    // ReLU via signed 32-bit integer max on the float bits: lanes with the sign bit set compare below zero and become 0.
+    // NOTE(review): a NaN lane may be treated differently than by a floating-point max - confirm this is acceptable.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats remain
+    const v128_t va = wasm_v128_load(a);  // NOTE(review): loads 4 floats though fewer remain; presumably callers permit the over-read - confirm
+
+    v128_t vy = wasm_f32x4_sub(vb, va);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Store only the valid lanes: a pair first if n contains a 2-float chunk, then a single float if one is left.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 written as one 64-bit value through a double* cast
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..b50a4ac
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x8.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused reversed subtract-by-constant on 128-bit WebAssembly SIMD vectors, eight floats per main-loop pass: y[i] is *b - a[i], negatives zeroed.
+void xnn_f32_vrsubc_relu_ukernel__wasmsimd_x8(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // vector operand, loaded four floats at a time
+    const float* b,  // only the first float is read (splat load)
+    float* y,  // output, one float written per element of a
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);  // all-zero vector used as the ReLU bound
+  const v128_t vb = wasm_v32x4_load_splat(b);  // *b replicated into all four 32-bit lanes
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {  // main loop: two 4-float vectors per pass
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    v128_t vy0123 = wasm_f32x4_sub(vb, va0123);  // reversed operand order: constant minus element
+    v128_t vy4567 = wasm_f32x4_sub(vb, va4567);
+
+    // ReLU via signed 32-bit integer max on the float bits: lanes with the sign bit set compare below zero and become 0.
+    // NOTE(review): a NaN lane may be treated differently than by a floating-point max - confirm this is acceptable.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // runs at most once: fewer than 8 floats are left here
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_sub(vb, va);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats remain
+    const v128_t va = wasm_v128_load(a);  // NOTE(review): loads 4 floats though fewer remain; presumably callers permit the over-read - confirm
+
+    v128_t vy = wasm_f32x4_sub(vb, va);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Store only the valid lanes: a pair first if n contains a 2-float chunk, then a single float if one is left.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 written as one 64-bit value through a double* cast
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-scalar-x1.c b/src/f32-vbinary/gen/vsub-relu-scalar-x1.c
new file mode 100644
index 0000000..2aa104f
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused element-wise subtraction: y[i] = math_max_f32(a[i] - b[i], 0.0f), one float per pass.
+void xnn_f32_vsub_relu_ukernel__scalar_x1(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // minuend vector
+    const float* b,  // subtrahend vector, read in step with a
+    float* y,  // output, one float written per input pair
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // n shrinks by one float's worth of bytes per pass; all three pointers advance together.
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va - vb;
+    vy = math_max_f32(vy, 0.0f);  // ReLU step
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-scalar-x2.c b/src/f32-vbinary/gen/vsub-relu-scalar-x2.c
new file mode 100644
index 0000000..b09f50f
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-scalar-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused element-wise subtraction: y[i] = math_max_f32(a[i] - b[i], 0.0f), two floats per main-loop pass.
+void xnn_f32_vsub_relu_ukernel__scalar_x2(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // minuend vector
+    const float* b,  // subtrahend vector, read in step with a
+    float* y,  // output, one float written per input pair
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: 2 elements per pass.
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 - vb0;
+    float vy1 = va1 - vb1;
+
+    // ReLU step: every difference goes through math_max_f32 against 0.0f.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // exactly one element is left here, so no pointer advance is needed
+    const float va = *a;
+    const float vb = *b;
+    float vy = va - vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-scalar-x4.c b/src/f32-vbinary/gen/vsub-relu-scalar-x4.c
new file mode 100644
index 0000000..3536ae9
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-scalar-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused element-wise subtraction: y[i] = math_max_f32(a[i] - b[i], 0.0f), four floats per main-loop pass.
+void xnn_f32_vsub_relu_ukernel__scalar_x4(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // minuend vector
+    const float* b,  // subtrahend vector, read in step with a
+    float* y,  // output, one float written per input pair
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: 4 elements per pass.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 - vb0;
+    float vy1 = va1 - vb1;
+    float vy2 = va2 - vb2;
+    float vy3 = va3 - vb3;
+
+    // ReLU step: every difference goes through math_max_f32 against 0.0f.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {  // remainder: the 1-3 leftover elements, one per pass
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va - vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-wasm-x1.c b/src/f32-vbinary/gen/vsub-relu-wasm-x1.c
new file mode 100644
index 0000000..6d8de30
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused element-wise subtraction: y[i] = __builtin_wasm_max_f32(a[i] - b[i], 0.0f), one float per pass.
+void xnn_f32_vsub_relu_ukernel__wasm_x1(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // minuend vector
+    const float* b,  // subtrahend vector, read in step with a
+    float* y,  // output, one float written per input pair
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // n shrinks by one float's worth of bytes per pass; all three pointers advance together.
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va - vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);  // ReLU step via the compiler's WebAssembly f32 max builtin
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-wasm-x2.c b/src/f32-vbinary/gen/vsub-relu-wasm-x2.c
new file mode 100644
index 0000000..8746a1a
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-wasm-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused element-wise subtraction: y[i] = __builtin_wasm_max_f32(a[i] - b[i], 0.0f), two floats per main-loop pass.
+void xnn_f32_vsub_relu_ukernel__wasm_x2(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // minuend vector
+    const float* b,  // subtrahend vector, read in step with a
+    float* y,  // output, one float written per input pair
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: 2 elements per pass.
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 - vb0;
+    float vy1 = va1 - vb1;
+
+    // ReLU step via the compiler's WebAssembly f32 max builtin.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // exactly one element is left here, so no pointer advance is needed
+    const float va = *a;
+    const float vb = *b;
+    float vy = va - vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-wasm-x4.c b/src/f32-vbinary/gen/vsub-relu-wasm-x4.c
new file mode 100644
index 0000000..4b5ea98
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-wasm-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused element-wise subtraction: y[i] = __builtin_wasm_max_f32(a[i] - b[i], 0.0f), four floats per main-loop pass.
+void xnn_f32_vsub_relu_ukernel__wasm_x4(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // minuend vector
+    const float* b,  // subtrahend vector, read in step with a
+    float* y,  // output, one float written per input pair
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: 4 elements per pass.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 - vb0;
+    float vy1 = va1 - vb1;
+    float vy2 = va2 - vb2;
+    float vy3 = va3 - vb3;
+
+    // ReLU step via the compiler's WebAssembly f32 max builtin.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {  // remainder: the 1-3 leftover elements, one per pass
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va - vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vsub-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..a30a889
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-wasmsimd-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused element-wise subtraction on 128-bit WebAssembly SIMD vectors: y[i] is a[i] - b[i], with negative results replaced by zero.
+void xnn_f32_vsub_relu_ukernel__wasmsimd_x4(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // minuend vector, loaded four floats at a time
+    const float* b,  // subtrahend vector, loaded in step with a
+    float* y,  // output, one float written per input pair
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);  // all-zero vector used as the ReLU bound
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: one full 4-float vector per pass
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb0123);
+
+    // ReLU via signed 32-bit integer max on the float bits: lanes with the sign bit set compare below zero and become 0.
+    // NOTE(review): a NaN lane may be treated differently than by a floating-point max - confirm this is acceptable.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats remain
+    const v128_t va = wasm_v128_load(a);  // NOTE(review): loads 4 floats though fewer remain; presumably callers permit the over-read - confirm
+    const v128_t vb = wasm_v128_load(b);  // same full-width load on the second operand
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Store only the valid lanes: a pair first if n contains a 2-float chunk, then a single float if one is left.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 written as one 64-bit value through a double* cast
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vsub-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..a69394e
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-wasmsimd-x8.c
@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused element-wise subtraction on 128-bit WebAssembly SIMD vectors, eight floats per main-loop pass: y[i] is a[i] - b[i], negatives zeroed.
+void xnn_f32_vsub_relu_ukernel__wasmsimd_x8(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // minuend vector, loaded four floats at a time
+    const float* b,  // subtrahend vector, loaded in step with a
+    float* y,  // output, one float written per input pair
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);  // all-zero vector used as the ReLU bound
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {  // main loop: two 4-float vectors per pass
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    b += 8;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_sub(va4567, vb4567);
+
+    // ReLU via signed 32-bit integer max on the float bits: lanes with the sign bit set compare below zero and become 0.
+    // NOTE(review): a NaN lane may be treated differently than by a floating-point max - confirm this is acceptable.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // runs at most once: fewer than 8 floats are left here
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats remain
+    const v128_t va = wasm_v128_load(a);  // NOTE(review): loads 4 floats though fewer remain; presumably callers permit the over-read - confirm
+    const v128_t vb = wasm_v128_load(b);  // same full-width load on the second operand
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Store only the valid lanes: a pair first if n contains a 2-float chunk, then a single float if one is left.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 written as one 64-bit value through a double* cast
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-scalar-x1.c b/src/f32-vbinary/gen/vsubc-relu-scalar-x1.c
new file mode 100644
index 0000000..3ebb7f3
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused subtract-by-constant: y[i] = math_max_f32(a[i] - *b, 0.0f), one float per pass.
+void xnn_f32_vsubc_relu_ukernel__scalar_x1(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // vector operand, one float read per output
+    const float* b,  // only *b is read
+    float* y,  // output, one float written per element of a
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar operand is loaded once and reused for every element.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {  // n shrinks by one float's worth of bytes per pass
+    const float va = *a++;
+    float vy = va - vb;  // element minus constant
+    vy = math_max_f32(vy, 0.0f);  // ReLU step
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-scalar-x2.c b/src/f32-vbinary/gen/vsubc-relu-scalar-x2.c
new file mode 100644
index 0000000..ab12102
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-scalar-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused subtract-by-constant: y[i] = math_max_f32(a[i] - *b, 0.0f), two floats per main-loop pass.
+void xnn_f32_vsubc_relu_ukernel__scalar_x2(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // vector operand, one float read per output
+    const float* b,  // only *b is read
+    float* y,  // output, one float written per element of a
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar operand is loaded once and reused for every element.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {  // main loop: 2 elements per pass
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 - vb;  // element minus constant
+    float vy1 = va1 - vb;
+
+    // ReLU step: every difference goes through math_max_f32 against 0.0f.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // exactly one element is left here, so no pointer advance is needed
+    const float va = *a;
+    float vy = va - vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-scalar-x4.c b/src/f32-vbinary/gen/vsubc-relu-scalar-x4.c
new file mode 100644
index 0000000..db29dcf
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-scalar-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused subtract-by-constant: y[i] = math_max_f32(a[i] - *b, 0.0f), four floats per main-loop pass.
+void xnn_f32_vsubc_relu_ukernel__scalar_x4(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // vector operand, one float read per output
+    const float* b,  // only *b is read
+    float* y,  // output, one float written per element of a
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar operand is loaded once and reused for every element.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: 4 elements per pass
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 - vb;  // element minus constant
+    float vy1 = va1 - vb;
+    float vy2 = va2 - vb;
+    float vy3 = va3 - vb;
+
+    // ReLU step: every difference goes through math_max_f32 against 0.0f.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {  // remainder: the 1-3 leftover elements, one per pass
+      const float va = *a++;
+      float vy = va - vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-wasm-x1.c b/src/f32-vbinary/gen/vsubc-relu-wasm-x1.c
new file mode 100644
index 0000000..6b26df9
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused subtract-by-constant: y[i] = __builtin_wasm_max_f32(a[i] - *b, 0.0f), one float per pass.
+void xnn_f32_vsubc_relu_ukernel__wasm_x1(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // vector operand, one float read per output
+    const float* b,  // only *b is read
+    float* y,  // output, one float written per element of a
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar operand is loaded once and reused for every element.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {  // n shrinks by one float's worth of bytes per pass
+    const float va = *a++;
+    float vy = va - vb;  // element minus constant
+    vy = __builtin_wasm_max_f32(vy, 0.0f);  // ReLU step via the compiler's WebAssembly f32 max builtin
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-wasm-x2.c b/src/f32-vbinary/gen/vsubc-relu-wasm-x2.c
new file mode 100644
index 0000000..1140931
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-wasm-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused subtract-by-constant: y[i] = __builtin_wasm_max_f32(a[i] - *b, 0.0f), two floats per main-loop pass.
+void xnn_f32_vsubc_relu_ukernel__wasm_x2(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // vector operand, one float read per output
+    const float* b,  // only *b is read
+    float* y,  // output, one float written per element of a
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar operand is loaded once and reused for every element.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {  // main loop: 2 elements per pass
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 - vb;  // element minus constant
+    float vy1 = va1 - vb;
+
+    // ReLU step via the compiler's WebAssembly f32 max builtin.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // exactly one element is left here, so no pointer advance is needed
+    const float va = *a;
+    float vy = va - vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-wasm-x4.c b/src/f32-vbinary/gen/vsubc-relu-wasm-x4.c
new file mode 100644
index 0000000..7e9485c
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-wasm-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused subtract-by-constant: y[i] = __builtin_wasm_max_f32(a[i] - *b, 0.0f), four floats per main-loop pass.
+void xnn_f32_vsubc_relu_ukernel__wasm_x4(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // vector operand, one float read per output
+    const float* b,  // only *b is read
+    float* y,  // output, one float written per element of a
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar operand is loaded once and reused for every element.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: 4 elements per pass
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 - vb;  // element minus constant
+    float vy1 = va1 - vb;
+    float vy2 = va2 - vb;
+    float vy3 = va3 - vb;
+
+    // ReLU step via the compiler's WebAssembly f32 max builtin.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {  // remainder: the 1-3 leftover elements, one per pass
+      const float va = *a++;
+      float vy = va - vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..02154be
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x4.c
@@ -0,0 +1,61 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// ReLU-fused subtract-by-constant on 128-bit WebAssembly SIMD vectors: y[i] is a[i] - *b, with negative results replaced by zero.
+void xnn_f32_vsubc_relu_ukernel__wasmsimd_x4(
+    size_t n,  // size in bytes (asserted non-zero and a multiple of sizeof(float))
+    const float* a,  // vector operand, loaded four floats at a time
+    const float* b,  // only the first float is read (splat load)
+    float* y,  // output, one float written per element of a
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not referenced in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);  // all-zero vector used as the ReLU bound
+  const v128_t vb = wasm_v32x4_load_splat(b);  // *b replicated into all four 32-bit lanes
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: one full 4-float vector per pass
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb);  // element minus constant
+
+    // ReLU via signed 32-bit integer max on the float bits: lanes with the sign bit set compare below zero and become 0.
+    // NOTE(review): a NaN lane may be treated differently than by a floating-point max - confirm this is acceptable.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats remain
+    const v128_t va = wasm_v128_load(a);  // NOTE(review): loads 4 floats though fewer remain; presumably callers permit the over-read - confirm
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Store only the valid lanes: a pair first if n contains a 2-float chunk, then a single float if one is left.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 written as one 64-bit value through a double* cast
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..4cc1893
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x8.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Computes y[i] = max(a[i] - b[0], 0) over n bytes of floats: 8 floats per
+// iteration first, then 4, then a 1-3 float remainder. The clamp is a
+// signed-integer max against all-zero bits, so any lane whose sign bit is set
+// is replaced by +0.0f.
+void xnn_f32_vsubc_relu_ukernel__wasmsimd_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vsubtrahend = wasm_v32x4_load_splat(b);
+  const v128_t vfloor = wasm_f32x4_splat(0.0f);
+  while (n >= 8 * sizeof(float)) {
+    const v128_t vin_lo = wasm_v128_load(a);
+    const v128_t vin_hi = wasm_v128_load(a + 4);
+
+    const v128_t vdiff_lo = wasm_f32x4_sub(vin_lo, vsubtrahend);
+    const v128_t vdiff_hi = wasm_f32x4_sub(vin_hi, vsubtrahend);
+
+    wasm_v128_store(y, wasm_i32x4_max(vdiff_lo, vfloor));
+    wasm_v128_store(y + 4, wasm_i32x4_max(vdiff_hi, vfloor));
+
+    a += 8;
+    y += 8;
+    n -= 8 * sizeof(float);
+  }
+  while (n >= 4 * sizeof(float)) {
+    const v128_t vin = wasm_v128_load(a);
+    const v128_t vdiff = wasm_f32x4_sub(vin, vsubtrahend);
+    wasm_v128_store(y, wasm_i32x4_max(vdiff, vfloor));
+
+    a += 4;
+    y += 4;
+    n -= 4 * sizeof(float);
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // Remainder of 1-3 floats: a whole vector is still loaded from a, but
+    // only the lanes selected by the bits of n are written to y.
+    v128_t vtail = wasm_f32x4_sub(wasm_v128_load(a), vsubtrahend);
+    vtail = wasm_i32x4_max(vtail, vfloor);
+
+    if (n & (2 * sizeof(float))) {
+      // Lanes 0-1 go out as one 64-bit store; lanes 2-3 then move down.
+      *((double*) y) = wasm_f64x2_extract_lane(vtail, 0);
+      vtail = wasm_v32x4_shuffle(vtail, vtail, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vtail, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/vop-scalar.c.in b/src/f32-vbinary/vop-scalar.c.in
index 0df0def..3802093 100644
--- a/src/f32-vbinary/vop-scalar.c.in
+++ b/src/f32-vbinary/vop-scalar.c.in
@@ -6,7 +6,7 @@
 $assert BATCH_TILE >= 1
 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 $assert OP in ["ADD", "DIV", "MAX", "MIN", "MUL", "SUB", "SQRDIFF"]
-$assert ACTIVATION in ["LINEAR", "MINMAX"]
+$assert ACTIVATION in ["LINEAR", "MINMAX", "RELU"]
 #include <assert.h>
 
 #include <xnnpack/common.h>
@@ -25,8 +25,8 @@
 $  "SUB": lambda x, y: "%s - %s" % (x, y),
 $  "SQRDIFF": lambda x, y: "%s - %s" % (x, y),
 $}[OP]
-$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION]
-$PARAMS = {"LINEAR": "xnn_f32_default_params", "MINMAX": "xnn_f32_minmax_params"}[ACTIVATION]
+$SUFFIX = {"LINEAR": "", "RELU": "_relu", "MINMAX": "_minmax"}[ACTIVATION]
+$PARAMS = {"LINEAR": "xnn_f32_default_params", "RELU": "xnn_f32_relu_params", "MINMAX": "xnn_f32_minmax_params"}[ACTIVATION]
 void xnn_f32_v${OP.lower()}${SUFFIX}_ukernel__${"wasm" if WASM else "scalar"}_x${BATCH_TILE}(
     size_t n,
     const float* a,
@@ -67,6 +67,9 @@
 
         $for N in range(BATCH_TILE):
           vy${ABC[N]} = ${MIN_F32}(vy${ABC[N]}, vy_max);
+      $elif ACTIVATION == "RELU":
+        $for N in range(BATCH_TILE):
+          vy${ABC[N]} = ${MAX_F32}(vy${ABC[N]}, 0.0f);
 
       $for N in range(BATCH_TILE):
         y[${N}] = vy${ABC[N]};
@@ -83,6 +86,8 @@
           $if ACTIVATION == "MINMAX":
             vy = ${MAX_F32}(vy, vy_min);
             vy = ${MIN_F32}(vy, vy_max);
+          $elif ACTIVATION == "RELU":
+            vy = ${MAX_F32}(vy, 0.0f);
           *y++ = vy;
           n -= sizeof(float);
         } while (n != 0);
@@ -95,6 +100,8 @@
         $if ACTIVATION == "MINMAX":
           vy = ${MAX_F32}(vy, vy_min);
           vy = ${MIN_F32}(vy, vy_max);
+        $elif ACTIVATION == "RELU":
+          vy = ${MAX_F32}(vy, 0.0f);
         *y = vy;
     }
   $else:
@@ -107,6 +114,8 @@
       $if ACTIVATION == "MINMAX":
         vy = ${MAX_F32}(vy, vy_min);
         vy = ${MIN_F32}(vy, vy_max);
+      $elif ACTIVATION == "RELU":
+        vy = ${MAX_F32}(vy, 0.0f);
       *y++ = vy;
     }
 }
diff --git a/src/f32-vbinary/vop-wasmsimd.c.in b/src/f32-vbinary/vop-wasmsimd.c.in
index 025ced0..8aadc4a 100644
--- a/src/f32-vbinary/vop-wasmsimd.c.in
+++ b/src/f32-vbinary/vop-wasmsimd.c.in
@@ -7,7 +7,7 @@
 $assert BATCH_TILE >= 4
 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 $assert OP in ["ADD", "DIV", "MAX", "MIN", "MUL", "SUB", "SQRDIFF"]
-$assert ACTIVATION in ["LINEAR", "MINMAX"]
+$assert ACTIVATION in ["LINEAR", "MINMAX", "RELU"]
 #include <assert.h>
 
 #include <wasm_simd128.h>
@@ -25,9 +25,9 @@
 $  "SUB": "wasm_f32x4_sub",
 $  "SQRDIFF": "wasm_f32x4_sub",
 $}[OP]
-$ARCH_SUFFIX = "" if ACTIVATION == "LINEAR" and OP not in ["MIN", "MAX"] else "_x86" if X86 else "_arm"
-$ACTIVATION_SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION]
-$PARAMS = {"LINEAR": "xnn_f32_default_params", "MINMAX": "xnn_f32_minmax_params"}[ACTIVATION]
+$ARCH_SUFFIX = "" if ACTIVATION in ["LINEAR", "RELU"] and OP not in ["MIN", "MAX"] else "_x86" if X86 else "_arm"
+$ACTIVATION_SUFFIX = {"LINEAR": ""}.get(ACTIVATION, "_" + ACTIVATION.lower())
+$PARAMS = {"LINEAR": "xnn_f32_default_params", "RELU": "xnn_f32_relu_params", "MINMAX": "xnn_f32_minmax_params"}[ACTIVATION]
 void xnn_f32_v${OP.lower()}${ACTIVATION_SUFFIX}_ukernel__wasmsimd${ARCH_SUFFIX}_x${BATCH_TILE}(
     size_t n,
     const float* a,
@@ -44,6 +44,8 @@
   $if ACTIVATION == "MINMAX":
     const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
     const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+  $elif ACTIVATION == "RELU":
+    const v128_t vzero = wasm_f32x4_splat(0.0f);
 
   for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
     const v128_t va${ABC[0:4]} = wasm_v128_load(a);
@@ -93,6 +95,9 @@
 
         $for N in range(0, BATCH_TILE, 4):
           vy${ABC[N:N+4]} = wasm_f32x4_min(vy${ABC[N:N+4]}, vy_max);
+    $elif ACTIVATION == "RELU":
+      $for N in range(0, BATCH_TILE, 4):
+        vy${ABC[N:N+4]} = wasm_i32x4_max(vy${ABC[N:N+4]}, vzero);
 
     wasm_v128_store(y, vy${ABC[0:4]});
     $for N in range(4, BATCH_TILE, 4):
@@ -127,6 +132,8 @@
         $else:
           vy = wasm_f32x4_max(vy, vy_min);
           vy = wasm_f32x4_min(vy, vy_max);
+      $elif ACTIVATION == "RELU":
+        vy = wasm_i32x4_max(vy, vzero);
 
       wasm_v128_store(y, vy);
       y += 4;
@@ -155,6 +162,8 @@
       $else:
         vy = wasm_f32x4_max(vy, vy_min);
         vy = wasm_f32x4_min(vy, vy_max);
+    $elif ACTIVATION == "RELU":
+      vy = wasm_i32x4_max(vy, vzero);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
diff --git a/src/f32-vbinary/vopc-scalar.c.in b/src/f32-vbinary/vopc-scalar.c.in
index bd1453b..f28d446 100644
--- a/src/f32-vbinary/vopc-scalar.c.in
+++ b/src/f32-vbinary/vopc-scalar.c.in
@@ -6,7 +6,7 @@
 $assert BATCH_TILE >= 1
 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 $assert OP in ["ADD", "DIV", "RDIV", "MAX", "MIN", "MUL", "SUB", "RSUB", "SQRDIFF"]
-$assert ACTIVATION in ["LINEAR", "MINMAX"]
+$assert ACTIVATION in ["LINEAR", "MINMAX", "RELU"]
 #include <assert.h>
 
 #include <xnnpack/common.h>
@@ -27,8 +27,8 @@
 $  "RSUB": lambda x: "vb - %s" % x,
 $  "SQRDIFF": lambda x: "%s - vb" % x,
 $}[OP]
-$SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION]
-$PARAMS = {"LINEAR": "xnn_f32_default_params", "MINMAX": "xnn_f32_minmax_params"}[ACTIVATION]
+$SUFFIX = {"LINEAR": "", "RELU": "_relu", "MINMAX": "_minmax"}[ACTIVATION]
+$PARAMS = {"LINEAR": "xnn_f32_default_params", "RELU": "xnn_f32_relu_params", "MINMAX": "xnn_f32_minmax_params"}[ACTIVATION]
 void xnn_f32_v${OP.lower()}c${SUFFIX}_ukernel__${"wasm" if WASM else "scalar"}_x${BATCH_TILE}(
     size_t n,
     const float* a,
@@ -66,6 +66,9 @@
 
         $for N in range(BATCH_TILE):
           vy${ABC[N]} = ${MIN_F32}(vy${ABC[N]}, vy_max);
+      $elif ACTIVATION == "RELU":
+        $for N in range(BATCH_TILE):
+          vy${ABC[N]} = ${MAX_F32}(vy${ABC[N]}, 0.0f);
 
       $for N in range(BATCH_TILE):
         y[${N}] = vy${ABC[N]};
@@ -81,6 +84,8 @@
           $if ACTIVATION == "MINMAX":
             vy = ${MAX_F32}(vy, vy_min);
             vy = ${MIN_F32}(vy, vy_max);
+          $elif ACTIVATION == "RELU":
+            vy = ${MAX_F32}(vy, 0.0f);
           *y++ = vy;
           n -= sizeof(float);
         } while (n != 0);
@@ -92,6 +97,8 @@
         $if ACTIVATION == "MINMAX":
           vy = ${MAX_F32}(vy, vy_min);
           vy = ${MIN_F32}(vy, vy_max);
+        $elif ACTIVATION == "RELU":
+          vy = ${MAX_F32}(vy, 0.0f);
         *y = vy;
     }
   $else:
@@ -103,6 +110,8 @@
       $if ACTIVATION == "MINMAX":
         vy = ${MAX_F32}(vy, vy_min);
         vy = ${MIN_F32}(vy, vy_max);
+      $elif ACTIVATION == "RELU":
+        vy = ${MAX_F32}(vy, 0.0f);
       *y++ = vy;
     }
 }
diff --git a/src/f32-vbinary/vopc-wasmsimd.c.in b/src/f32-vbinary/vopc-wasmsimd.c.in
index 5cc437d..c8ba8a9 100644
--- a/src/f32-vbinary/vopc-wasmsimd.c.in
+++ b/src/f32-vbinary/vopc-wasmsimd.c.in
@@ -7,7 +7,7 @@
 $assert BATCH_TILE >= 4
 $ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 $assert OP in ["ADD", "DIV", "RDIV", "MAX", "MIN", "MUL", "SUB", "RSUB", "SQRDIFF"]
-$assert ACTIVATION in ["LINEAR", "MINMAX"]
+$assert ACTIVATION in ["LINEAR", "MINMAX", "RELU"]
 #include <assert.h>
 
 #include <wasm_simd128.h>
@@ -27,9 +27,10 @@
 $  "RSUB": lambda x: "wasm_f32x4_sub(vb, %s)" % x,
 $  "SQRDIFF": lambda x: "wasm_f32x4_sub(%s, vb)" % x,
 $}[OP]
-$ARCH_SUFFIX = "" if ACTIVATION == "LINEAR" and OP not in ["MIN", "MAX"] else "_x86" if X86 else "_arm"
-$ACTIVATION_SUFFIX = {"LINEAR": "", "MINMAX": "_minmax"}[ACTIVATION]
-$PARAMS = {"LINEAR": "xnn_f32_default_params", "MINMAX": "xnn_f32_minmax_params"}[ACTIVATION]
+$assert ACTIVATION in ["LINEAR", "RELU", "MINMAX"]
+$ARCH_SUFFIX = "" if ACTIVATION in ["LINEAR", "RELU"] and OP not in ["MIN", "MAX"] else "_x86" if X86 else "_arm"
+$ACTIVATION_SUFFIX = {"LINEAR": ""}.get(ACTIVATION, "_" + ACTIVATION.lower())
+$PARAMS = {"LINEAR": "xnn_f32_default_params", "RELU": "xnn_f32_relu_params", "MINMAX": "xnn_f32_minmax_params"}[ACTIVATION]
 void xnn_f32_v${OP.lower()}c${ACTIVATION_SUFFIX}_ukernel__wasmsimd${ARCH_SUFFIX}_x${BATCH_TILE}(
     size_t n,
     const float* a,
@@ -46,7 +47,8 @@
   $if ACTIVATION == "MINMAX":
     const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
     const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
-
+  $elif ACTIVATION == "RELU":
+    const v128_t vzero = wasm_f32x4_splat(0.0f);
   const v128_t vb = wasm_v32x4_load_splat(b);
   for (; n >= ${BATCH_TILE} * sizeof(float); n -= ${BATCH_TILE} * sizeof(float)) {
     const v128_t va${ABC[0:4]} = wasm_v128_load(a);
@@ -91,6 +93,9 @@
 
         $for N in range(0, BATCH_TILE, 4):
           vy${ABC[N:N+4]} = wasm_f32x4_min(vy${ABC[N:N+4]}, vy_max);
+    $elif ACTIVATION == "RELU":
+      $for N in range(0, BATCH_TILE, 4):
+        vy${ABC[N:N+4]} = wasm_i32x4_max(vy${ABC[N:N+4]}, vzero);
 
     wasm_v128_store(y, vy${ABC[0:4]});
     $for N in range(4, BATCH_TILE, 4):
@@ -122,6 +127,8 @@
         $else:
           vy = wasm_f32x4_max(vy, vy_min);
           vy = wasm_f32x4_min(vy, vy_max);
+      $elif ACTIVATION == "RELU":
+        vy = wasm_i32x4_max(vy, vzero);
 
       wasm_v128_store(y, vy);
       y += 4;
@@ -149,6 +156,8 @@
       $else:
         vy = wasm_f32x4_max(vy, vy_min);
         vy = wasm_f32x4_min(vy, vy_max);
+    $elif ACTIVATION == "RELU":
+      vy = wasm_i32x4_max(vy, vzero);
 
     if (n & (2 * sizeof(float))) {
       *((double*) y) = wasm_f64x2_extract_lane(vy, 0);