Add binary op microkernels with RELU activation

PiperOrigin-RevId: 325607697
diff --git a/src/f32-vbinary/gen/vadd-relu-scalar-x1.c b/src/f32-vbinary/gen/vadd-relu-scalar-x1.c
new file mode 100644
index 0000000..d701332
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] + b[i], 0.0f) over n bytes of floats, one element per iteration.
+void xnn_f32_vadd_relu_ukernel__scalar_x1(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read in lockstep with a
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // n counts bytes, so every iteration consumes sizeof(float) of it.
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va + vb;
+    vy = math_max_f32(vy, 0.0f);  // ReLU step: max of the sum and 0.0f
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-scalar-x2.c b/src/f32-vbinary/gen/vadd-relu-scalar-x2.c
new file mode 100644
index 0000000..8feca13
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-scalar-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] + b[i], 0.0f); main loop handles 2 floats per iteration, then a 1-element remainder.
+void xnn_f32_vadd_relu_ukernel__scalar_x2(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read in lockstep with a
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: two floats per iteration; n is decremented in bytes.
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+
+    // ReLU step: max of each sum and 0.0f.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // n is a multiple of sizeof(float) and below two floats here, so exactly one element is left
+    const float va = *a;
+    const float vb = *b;
+    float vy = va + vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-scalar-x4.c b/src/f32-vbinary/gen/vadd-relu-scalar-x4.c
new file mode 100644
index 0000000..11131c0
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-scalar-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] + b[i], 0.0f); main loop handles 4 floats per iteration, then a per-element remainder loop.
+void xnn_f32_vadd_relu_ukernel__scalar_x4(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read in lockstep with a
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: four floats per iteration; n is decremented in bytes.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+    float vy2 = va2 + vb2;
+    float vy3 = va3 + vb3;
+
+    // ReLU step: max of each sum and 0.0f.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 elements are left here; they are processed one at a time
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va + vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-wasm-x1.c b/src/f32-vbinary/gen/vadd-relu-wasm-x1.c
new file mode 100644
index 0000000..45bbe36
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] + b[i], 0.0f) over n bytes of floats, one element per iteration, using the wasm max builtin.
+void xnn_f32_vadd_relu_ukernel__wasm_x1(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read in lockstep with a
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // n counts bytes, so every iteration consumes sizeof(float) of it.
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va + vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);  // ReLU step: max of the sum and 0.0f
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-wasm-x2.c b/src/f32-vbinary/gen/vadd-relu-wasm-x2.c
new file mode 100644
index 0000000..ab69037
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-wasm-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] + b[i], 0.0f); main loop handles 2 floats per iteration, then a 1-element remainder.
+void xnn_f32_vadd_relu_ukernel__wasm_x2(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read in lockstep with a
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: two floats per iteration; n is decremented in bytes.
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+
+    // ReLU step: max of each sum and 0.0f via the wasm builtin.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // n is a multiple of sizeof(float) and below two floats here, so exactly one element is left
+    const float va = *a;
+    const float vb = *b;
+    float vy = va + vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-wasm-x4.c b/src/f32-vbinary/gen/vadd-relu-wasm-x4.c
new file mode 100644
index 0000000..fd776af
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-wasm-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] + b[i], 0.0f); main loop handles 4 floats per iteration, then a per-element remainder loop.
+void xnn_f32_vadd_relu_ukernel__wasm_x4(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read in lockstep with a
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: four floats per iteration; n is decremented in bytes.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+    float vy2 = va2 + vb2;
+    float vy3 = va3 + vb3;
+
+    // ReLU step: max of each sum and 0.0f via the wasm builtin.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 elements are left here; they are processed one at a time
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va + vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vadd-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..2710ce0
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-wasmsimd-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// Adds a and b element-wise and clamps at zero using 128-bit WAsm SIMD: 4 floats per iteration, then a 1-3 float tail.
+void xnn_f32_vadd_relu_ukernel__wasmsimd_x4(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read in lockstep with a
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb0123);
+
+    // ReLU via signed-integer max on the float bit patterns: a lane with its sign bit set is replaced by +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);  // NOTE(review): a NaN with a clear sign bit passes through unchanged - confirm intended
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 floats are left here
+    const v128_t va = wasm_v128_load(a);  // loads a full 16 bytes although fewer than 4 floats remain
+    const v128_t vb = wasm_v128_load(b);  // same full-width load for b; NOTE(review): presumably callers tolerate the over-read - confirm
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // writes float lanes 0-1 as one 64-bit store through a double*
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // move lanes 2,3 down to lanes 0,1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);  // store the single remaining float
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vadd-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..10d58bb
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-wasmsimd-x8.c
@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// Adds a and b element-wise and clamps at zero using 128-bit WAsm SIMD: 8 floats per iteration, then 4, then a 1-3 float tail.
+void xnn_f32_vadd_relu_ukernel__wasmsimd_x8(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // first input, read sequentially
+    const float* b,  // second input, read in lockstep with a
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    b += 8;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_add(va4567, vb4567);
+
+    // ReLU via signed-integer max on the float bit patterns: a lane with its sign bit set is replaced by +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);  // NOTE(review): a NaN with a clear sign bit passes through unchanged - confirm intended
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // runs at most once: fewer than 8 floats remain here
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 floats are left here
+    const v128_t va = wasm_v128_load(a);  // loads a full 16 bytes although fewer than 4 floats remain
+    const v128_t vb = wasm_v128_load(b);  // same full-width load for b; NOTE(review): presumably callers tolerate the over-read - confirm
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // writes float lanes 0-1 as one 64-bit store through a double*
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // move lanes 2,3 down to lanes 0,1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);  // store the single remaining float
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-scalar-x1.c b/src/f32-vbinary/gen/vaddc-relu-scalar-x1.c
new file mode 100644
index 0000000..309398e
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] + *b, 0.0f) over n bytes of floats, one element per iteration.
+void xnn_f32_vaddc_relu_ukernel__scalar_x1(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input array, read sequentially
+    const float* b,  // only *b is read: one scalar added to every element
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar addend is loaded once, before the loop.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    float vy = va + vb;
+    vy = math_max_f32(vy, 0.0f);  // ReLU step: max of the sum and 0.0f
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-scalar-x2.c b/src/f32-vbinary/gen/vaddc-relu-scalar-x2.c
new file mode 100644
index 0000000..2247e79
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-scalar-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] + *b, 0.0f); main loop handles 2 floats per iteration, then a 1-element remainder.
+void xnn_f32_vaddc_relu_ukernel__scalar_x2(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input array, read sequentially
+    const float* b,  // only *b is read: one scalar added to every element
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar addend is loaded once, before the loop.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+
+    // ReLU step: max of each sum and 0.0f.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // n is a multiple of sizeof(float) and below two floats here, so exactly one element is left
+    const float va = *a;
+    float vy = va + vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-scalar-x4.c b/src/f32-vbinary/gen/vaddc-relu-scalar-x4.c
new file mode 100644
index 0000000..36ea948
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-scalar-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] + *b, 0.0f); main loop handles 4 floats per iteration, then a per-element remainder loop.
+void xnn_f32_vaddc_relu_ukernel__scalar_x4(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input array, read sequentially
+    const float* b,  // only *b is read: one scalar added to every element
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar addend is loaded once, before the loop.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+    float vy2 = va2 + vb;
+    float vy3 = va3 + vb;
+
+    // ReLU step: max of each sum and 0.0f.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 elements are left here; they are processed one at a time
+    do {
+      const float va = *a++;
+      float vy = va + vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-wasm-x1.c b/src/f32-vbinary/gen/vaddc-relu-wasm-x1.c
new file mode 100644
index 0000000..de88639
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] + *b, 0.0f) over n bytes of floats, one element per iteration, using the wasm max builtin.
+void xnn_f32_vaddc_relu_ukernel__wasm_x1(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input array, read sequentially
+    const float* b,  // only *b is read: one scalar added to every element
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar addend is loaded once, before the loop.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    float vy = va + vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);  // ReLU step: max of the sum and 0.0f
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-wasm-x2.c b/src/f32-vbinary/gen/vaddc-relu-wasm-x2.c
new file mode 100644
index 0000000..638d8c5
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-wasm-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] + *b, 0.0f); main loop handles 2 floats per iteration, then a 1-element remainder.
+void xnn_f32_vaddc_relu_ukernel__wasm_x2(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input array, read sequentially
+    const float* b,  // only *b is read: one scalar added to every element
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar addend is loaded once, before the loop.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+
+    // ReLU step: max of each sum and 0.0f via the wasm builtin.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // n is a multiple of sizeof(float) and below two floats here, so exactly one element is left
+    const float va = *a;
+    float vy = va + vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-wasm-x4.c b/src/f32-vbinary/gen/vaddc-relu-wasm-x4.c
new file mode 100644
index 0000000..275891b
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-wasm-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] + *b, 0.0f); main loop handles 4 floats per iteration, then a per-element remainder loop.
+void xnn_f32_vaddc_relu_ukernel__wasm_x4(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input array, read sequentially
+    const float* b,  // only *b is read: one scalar added to every element
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The scalar addend is loaded once, before the loop.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+    float vy2 = va2 + vb;
+    float vy3 = va3 + vb;
+
+    // ReLU step: max of each sum and 0.0f via the wasm builtin.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 elements are left here; they are processed one at a time
+    do {
+      const float va = *a++;
+      float vy = va + vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..e1f7fdf
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x4.c
@@ -0,0 +1,61 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// Adds the scalar *b to every element of a and clamps at zero using 128-bit WAsm SIMD: 4 floats per iteration, then a 1-3 float tail.
+void xnn_f32_vaddc_relu_ukernel__wasmsimd_x4(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input array, read sequentially
+    const float* b,  // only the float at b is read; it is splatted across all lanes
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);  // loaded once, before the loop
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb);
+
+    // ReLU via signed-integer max on the float bit patterns: a lane with its sign bit set is replaced by +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);  // NOTE(review): a NaN with a clear sign bit passes through unchanged - confirm intended
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 floats are left here
+    const v128_t va = wasm_v128_load(a);  // full 16-byte load although fewer than 4 floats remain; NOTE(review): presumably callers tolerate the over-read - confirm
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // writes float lanes 0-1 as one 64-bit store through a double*
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // move lanes 2,3 down to lanes 0,1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);  // store the single remaining float
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..eb52c12
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x8.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// Adds the scalar *b to every element of a and clamps at zero using 128-bit WAsm SIMD: 8 floats per iteration, then 4, then a 1-3 float tail.
+void xnn_f32_vaddc_relu_ukernel__wasmsimd_x8(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input array, read sequentially
+    const float* b,  // only the float at b is read; it is splatted across all lanes
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);  // loaded once, before the loops
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_add(va4567, vb);
+
+    // ReLU via signed-integer max on the float bit patterns: a lane with its sign bit set is replaced by +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);  // NOTE(review): a NaN with a clear sign bit passes through unchanged - confirm intended
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // runs at most once: fewer than 8 floats remain here
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 floats are left here
+    const v128_t va = wasm_v128_load(a);  // full 16-byte load although fewer than 4 floats remain; NOTE(review): presumably callers tolerate the over-read - confirm
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // writes float lanes 0-1 as one 64-bit store through a double*
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // move lanes 2,3 down to lanes 0,1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);  // store the single remaining float
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-scalar-x1.c b/src/f32-vbinary/gen/vdiv-relu-scalar-x1.c
new file mode 100644
index 0000000..fe1c875
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] / b[i], 0.0f) over n bytes of floats, one element per iteration.
+void xnn_f32_vdiv_relu_ukernel__scalar_x1(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // dividends, read sequentially
+    const float* b,  // divisors, read in lockstep with a
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // n counts bytes, so every iteration consumes sizeof(float) of it.
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va / vb;
+    vy = math_max_f32(vy, 0.0f);  // ReLU step: max of the quotient and 0.0f
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-scalar-x2.c b/src/f32-vbinary/gen/vdiv-relu-scalar-x2.c
new file mode 100644
index 0000000..ce988e4
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-scalar-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] / b[i], 0.0f); main loop handles 2 floats per iteration, then a 1-element remainder.
+void xnn_f32_vdiv_relu_ukernel__scalar_x2(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // dividends, read sequentially
+    const float* b,  // divisors, read in lockstep with a
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: two floats per iteration; n is decremented in bytes.
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 / vb0;
+    float vy1 = va1 / vb1;
+
+    // ReLU step: max of each quotient and 0.0f.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // n is a multiple of sizeof(float) and below two floats here, so exactly one element is left
+    const float va = *a;
+    const float vb = *b;
+    float vy = va / vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-scalar-x4.c b/src/f32-vbinary/gen/vdiv-relu-scalar-x4.c
new file mode 100644
index 0000000..db115d3
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-scalar-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] / b[i], 0.0f); main loop handles 4 floats per iteration, then a per-element remainder loop.
+void xnn_f32_vdiv_relu_ukernel__scalar_x4(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // dividends, read sequentially
+    const float* b,  // divisors, read in lockstep with a
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: four floats per iteration; n is decremented in bytes.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 / vb0;
+    float vy1 = va1 / vb1;
+    float vy2 = va2 / vb2;
+    float vy3 = va3 / vb3;
+
+    // ReLU step: max of each quotient and 0.0f.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 elements are left here; they are processed one at a time
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va / vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-wasm-x1.c b/src/f32-vbinary/gen/vdiv-relu-wasm-x1.c
new file mode 100644
index 0000000..94ef79e
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Computes y[i] = max(a[i] / b[i], 0.0f) over n bytes of floats, one element per iteration, using the wasm max builtin.
+void xnn_f32_vdiv_relu_ukernel__wasm_x1(
+    size_t n,  // size in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // dividends, read sequentially
+    const float* b,  // divisors, read in lockstep with a
+    float* y,  // output, written sequentially
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // n counts bytes, so every iteration consumes sizeof(float) of it.
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va / vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);  // ReLU step: max of the quotient and 0.0f
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-wasm-x2.c b/src/f32-vbinary/gen/vdiv-relu-wasm-x2.c
new file mode 100644
index 0000000..4e70db1
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-wasm-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] / b[i], 0.0f), two floats per main-loop iteration.
+void xnn_f32_vdiv_relu_ukernel__wasm_x2(
+    size_t n,  // input length in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // dividends
+    const float* b,  // divisors, one per element of a
+    float* y,  // receives the clamped quotients
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: n is a byte count, so it shrinks by 2 * sizeof(float) per pair.
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 / vb0;
+    float vy1 = va1 / vb1;
+
+    // ReLU step: each quotient gets a lower bound of zero.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // exactly one float left: n is a multiple of sizeof(float) below 2 * sizeof(float)
+    const float va = *a;
+    const float vb = *b;
+    float vy = va / vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-wasm-x4.c b/src/f32-vbinary/gen/vdiv-relu-wasm-x4.c
new file mode 100644
index 0000000..6683ed4
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-wasm-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] / b[i], 0.0f), four floats per main-loop iteration.
+void xnn_f32_vdiv_relu_ukernel__wasm_x4(
+    size_t n,  // input length in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // dividends
+    const float* b,  // divisors, one per element of a
+    float* y,  // receives the clamped quotients
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: n is a byte count, so it shrinks by 4 * sizeof(float) per group.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 / vb0;
+    float vy1 = va1 / vb1;
+    float vy2 = va2 / vb2;
+    float vy3 = va3 / vb3;
+
+    // ReLU step: each quotient gets a lower bound of zero.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 floats left over from the main loop
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va / vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..0f6ad66
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] / b[i], 0.0f) on 128-bit vectors, four floats per main-loop iteration.
+void xnn_f32_vdiv_relu_ukernel__wasmsimd_x4(
+    size_t n,  // input length in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // dividends
+    const float* b,  // divisors, one per element of a
+    float* y,  // receives the clamped quotients
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);  // 0.0f is all-zero bits in each lane
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb0123);
+
+    // ReLU via signed 32-bit integer max: lanes with the sign bit set compare below zero and become +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 floats left over from the main loop
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+    // NOTE(review): both loads fetch 16 bytes while n < 16; presumably callers keep the extra bytes readable - confirm.
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the remaining floats: a pair first if present, then a final single.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 written as one 64-bit store
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..95d66a9
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x8.c
@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] / b[i], 0.0f) on 128-bit vectors, eight floats per main-loop iteration.
+void xnn_f32_vdiv_relu_ukernel__wasmsimd_x8(
+    size_t n,  // input length in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // dividends
+    const float* b,  // divisors, one per element of a
+    float* y,  // receives the clamped quotients
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);  // 0.0f is all-zero bits in each lane
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    b += 8;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_div(va4567, vb4567);
+
+    // ReLU via signed 32-bit integer max: lanes with the sign bit set compare below zero and become +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // at most one pass: fewer than 8 floats remain
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 floats left over
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+    // NOTE(review): both loads fetch 16 bytes while n < 16; presumably callers keep the extra bytes readable - confirm.
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the remaining floats: a pair first if present, then a final single.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 written as one 64-bit store
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-scalar-x1.c b/src/f32-vbinary/gen/vdivc-relu-scalar-x1.c
new file mode 100644
index 0000000..73f22a7
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] / b[0], 0.0f) with a single shared divisor, one float per loop iteration.
+void xnn_f32_vdivc_relu_ukernel__scalar_x1(
+    size_t n,  // length of a in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // dividends
+    const float* b,  // only b[0] is read; it divides every element of a
+    float* y,  // receives the clamped quotients
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The divisor is read once up front; n is a byte count consumed sizeof(float) at a time.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    float vy = va / vb;
+    vy = math_max_f32(vy, 0.0f);  // ReLU step: lower bound of zero
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-scalar-x2.c b/src/f32-vbinary/gen/vdivc-relu-scalar-x2.c
new file mode 100644
index 0000000..9f27717
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-scalar-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] / b[0], 0.0f) with a single shared divisor, two floats per main-loop iteration.
+void xnn_f32_vdivc_relu_ukernel__scalar_x2(
+    size_t n,  // length of a in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // dividends
+    const float* b,  // only b[0] is read; it divides every element of a
+    float* y,  // receives the clamped quotients
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The divisor is read once up front; n is a byte count, shrinking by 2 * sizeof(float) per pair.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 / vb;
+    float vy1 = va1 / vb;
+
+    // ReLU step: each quotient gets a lower bound of zero.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // exactly one float left: n is a multiple of sizeof(float) below 2 * sizeof(float)
+    const float va = *a;
+    float vy = va / vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-scalar-x4.c b/src/f32-vbinary/gen/vdivc-relu-scalar-x4.c
new file mode 100644
index 0000000..74bf59b
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-scalar-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] / b[0], 0.0f) with a single shared divisor, four floats per main-loop iteration.
+void xnn_f32_vdivc_relu_ukernel__scalar_x4(
+    size_t n,  // length of a in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // dividends
+    const float* b,  // only b[0] is read; it divides every element of a
+    float* y,  // receives the clamped quotients
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The divisor is read once up front; n is a byte count, shrinking by 4 * sizeof(float) per group.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 / vb;
+    float vy1 = va1 / vb;
+    float vy2 = va2 / vb;
+    float vy3 = va3 / vb;
+
+    // ReLU step: each quotient gets a lower bound of zero.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 floats left over from the main loop
+    do {
+      const float va = *a++;
+      float vy = va / vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-wasm-x1.c b/src/f32-vbinary/gen/vdivc-relu-wasm-x1.c
new file mode 100644
index 0000000..3c917ae
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] / b[0], 0.0f) with a single shared divisor, one float per loop iteration.
+void xnn_f32_vdivc_relu_ukernel__wasm_x1(
+    size_t n,  // length of a in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // dividends
+    const float* b,  // only b[0] is read; it divides every element of a
+    float* y,  // receives the clamped quotients
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The divisor is read once up front; n is a byte count consumed sizeof(float) at a time.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    float vy = va / vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);  // ReLU step: lower bound of zero
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-wasm-x2.c b/src/f32-vbinary/gen/vdivc-relu-wasm-x2.c
new file mode 100644
index 0000000..1f902ae
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-wasm-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] / b[0], 0.0f) with a single shared divisor, two floats per main-loop iteration.
+void xnn_f32_vdivc_relu_ukernel__wasm_x2(
+    size_t n,  // length of a in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // dividends
+    const float* b,  // only b[0] is read; it divides every element of a
+    float* y,  // receives the clamped quotients
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The divisor is read once up front; n is a byte count, shrinking by 2 * sizeof(float) per pair.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 / vb;
+    float vy1 = va1 / vb;
+
+    // ReLU step: each quotient gets a lower bound of zero.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // exactly one float left: n is a multiple of sizeof(float) below 2 * sizeof(float)
+    const float va = *a;
+    float vy = va / vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-wasm-x4.c b/src/f32-vbinary/gen/vdivc-relu-wasm-x4.c
new file mode 100644
index 0000000..8e3f484
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-wasm-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] / b[0], 0.0f) with a single shared divisor, four floats per main-loop iteration.
+void xnn_f32_vdivc_relu_ukernel__wasm_x4(
+    size_t n,  // length of a in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // dividends
+    const float* b,  // only b[0] is read; it divides every element of a
+    float* y,  // receives the clamped quotients
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The divisor is read once up front; n is a byte count, shrinking by 4 * sizeof(float) per group.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 / vb;
+    float vy1 = va1 / vb;
+    float vy2 = va2 / vb;
+    float vy3 = va3 / vb;
+
+    // ReLU step: each quotient gets a lower bound of zero.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 floats left over from the main loop
+    do {
+      const float va = *a++;
+      float vy = va / vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..6567e78
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x4.c
@@ -0,0 +1,61 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] / b[0], 0.0f) on 128-bit vectors with a shared divisor, four floats per main-loop iteration.
+void xnn_f32_vdivc_relu_ukernel__wasmsimd_x4(
+    size_t n,  // length of a in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // dividends
+    const float* b,  // only b[0] is read; it divides every element of a
+    float* y,  // receives the clamped quotients
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);  // 0.0f is all-zero bits in each lane
+  const v128_t vb = wasm_v32x4_load_splat(b);  // b[0] loaded once and replicated into all four lanes
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb);
+
+    // ReLU via signed 32-bit integer max: lanes with the sign bit set compare below zero and become +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 floats left over from the main loop
+    const v128_t va = wasm_v128_load(a);
+    // NOTE(review): the load fetches 16 bytes while n < 16; presumably callers keep the extra bytes readable - confirm.
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the remaining floats: a pair first if present, then a final single.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 written as one 64-bit store
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..ad68670
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x8.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] / b[0], 0.0f) on 128-bit vectors with a shared divisor, eight floats per main-loop iteration.
+void xnn_f32_vdivc_relu_ukernel__wasmsimd_x8(
+    size_t n,  // length of a in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // dividends
+    const float* b,  // only b[0] is read; it divides every element of a
+    float* y,  // receives the clamped quotients
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);  // 0.0f is all-zero bits in each lane
+  const v128_t vb = wasm_v32x4_load_splat(b);  // b[0] loaded once and replicated into all four lanes
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_div(va4567, vb);
+
+    // ReLU via signed 32-bit integer max: lanes with the sign bit set compare below zero and become +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // at most one pass: fewer than 8 floats remain
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 floats left over
+    const v128_t va = wasm_v128_load(a);
+    // NOTE(review): the load fetches 16 bytes while n < 16; presumably callers keep the extra bytes readable - confirm.
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the remaining floats: a pair first if present, then a final single.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 written as one 64-bit store
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-scalar-x1.c b/src/f32-vbinary/gen/vmul-relu-scalar-x1.c
new file mode 100644
index 0000000..29622c6
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] * b[i], 0.0f), one float per loop iteration.
+void xnn_f32_vmul_relu_ukernel__scalar_x1(
+    size_t n,  // input length in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // first factors
+    const float* b,  // second factors, one per element of a
+    float* y,  // receives the clamped products
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // n is a byte count, so every processed element consumes sizeof(float) of it.
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va * vb;
+    vy = math_max_f32(vy, 0.0f);  // ReLU step: lower bound of zero
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-scalar-x2.c b/src/f32-vbinary/gen/vmul-relu-scalar-x2.c
new file mode 100644
index 0000000..48f6bcf
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-scalar-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] * b[i], 0.0f), two floats per main-loop iteration.
+void xnn_f32_vmul_relu_ukernel__scalar_x2(
+    size_t n,  // input length in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // first factors
+    const float* b,  // second factors, one per element of a
+    float* y,  // receives the clamped products
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: n is a byte count, so it shrinks by 2 * sizeof(float) per pair.
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+
+    // ReLU step: each product gets a lower bound of zero.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // exactly one float left: n is a multiple of sizeof(float) below 2 * sizeof(float)
+    const float va = *a;
+    const float vb = *b;
+    float vy = va * vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-scalar-x4.c b/src/f32-vbinary/gen/vmul-relu-scalar-x4.c
new file mode 100644
index 0000000..d0acbb8
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-scalar-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] * b[i], 0.0f), four floats per main-loop iteration.
+void xnn_f32_vmul_relu_ukernel__scalar_x4(
+    size_t n,  // input length in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // first factors
+    const float* b,  // second factors, one per element of a
+    float* y,  // receives the clamped products
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: n is a byte count, so it shrinks by 4 * sizeof(float) per group.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+    float vy2 = va2 * vb2;
+    float vy3 = va3 * vb3;
+
+    // ReLU step: each product gets a lower bound of zero.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 floats left over from the main loop
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va * vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-wasm-x1.c b/src/f32-vbinary/gen/vmul-relu-wasm-x1.c
new file mode 100644
index 0000000..2d34fed
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] * b[i], 0.0f), one float per loop iteration.
+void xnn_f32_vmul_relu_ukernel__wasm_x1(
+    size_t n,  // input length in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // first factors
+    const float* b,  // second factors, one per element of a
+    float* y,  // receives the clamped products
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // n is a byte count, so every processed element consumes sizeof(float) of it.
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va * vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);  // ReLU step: lower bound of zero
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-wasm-x2.c b/src/f32-vbinary/gen/vmul-relu-wasm-x2.c
new file mode 100644
index 0000000..d6f8677
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-wasm-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] * b[i], 0.0f), two floats per main-loop iteration.
+void xnn_f32_vmul_relu_ukernel__wasm_x2(
+    size_t n,  // input length in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // first factors
+    const float* b,  // second factors, one per element of a
+    float* y,  // receives the clamped products
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: n is a byte count, so it shrinks by 2 * sizeof(float) per pair.
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+
+    // ReLU step: each product gets a lower bound of zero.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // exactly one float left: n is a multiple of sizeof(float) below 2 * sizeof(float)
+    const float va = *a;
+    const float vb = *b;
+    float vy = va * vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-wasm-x4.c b/src/f32-vbinary/gen/vmul-relu-wasm-x4.c
new file mode 100644
index 0000000..20471db
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-wasm-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] * b[i], 0.0f), four floats per main-loop iteration.
+void xnn_f32_vmul_relu_ukernel__wasm_x4(
+    size_t n,  // input length in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // first factors
+    const float* b,  // second factors, one per element of a
+    float* y,  // receives the clamped products
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Main loop: n is a byte count, so it shrinks by 4 * sizeof(float) per group.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+    float vy2 = va2 * vb2;
+    float vy3 = va3 * vb3;
+
+    // ReLU step: each product gets a lower bound of zero.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 floats left over from the main loop
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va * vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vmul-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..e3b236a
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-wasmsimd-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = max(a[i] * b[i], 0.0f) on 128-bit vectors, four floats per main-loop iteration.
+void xnn_f32_vmul_relu_ukernel__wasmsimd_x4(
+    size_t n,  // input length in bytes (asserted to be a non-zero multiple of sizeof(float))
+    const float* a,  // first factors
+    const float* b,  // second factors, one per element of a
+    float* y,  // receives the clamped products
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is not read by this kernel
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);  // 0.0f is all-zero bits in each lane
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb0123);
+
+    // ReLU via signed 32-bit integer max: lanes with the sign bit set compare below zero and become +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1 to 3 floats left over from the main loop
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+    // NOTE(review): both loads fetch 16 bytes while n < 16; presumably callers keep the extra bytes readable - confirm.
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Write back only the remaining floats: a pair first if present, then a final single.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 written as one 64-bit store
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down to lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vmul-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..2fe6699
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-wasmsimd-x8.c
@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise multiply with ReLU: stores a[i] * b[i], with sign-bit-set products replaced by zero, into y[i].
+void xnn_f32_vmul_relu_ukernel__wasmsimd_x8(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {  // main loop: two 4-float vectors per iteration
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    b += 8;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_mul(va4567, vb4567);
+
+    // ReLU via signed 32-bit integer max with the all-zero bits of 0.0f: a lane whose sign bit is set compares below 0 and is zeroed.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // fewer than 8 floats left here, so this runs at most once
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);  // same integer-max ReLU as in the main loop
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats left, given that n is a multiple of sizeof(float)
+    const v128_t va = wasm_v128_load(a);  // full 128-bit load; NOTE(review): reads past the last valid float of a (and b below) - confirm callers allow it
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    if (n & (2 * sizeof(float))) {  // 2 or 3 floats left
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 in one 64-bit write; NOTE(review): double* access to float storage - confirm aliasing/alignment is acceptable
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down into lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {  // odd count: one last float
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-scalar-x1.c b/src/f32-vbinary/gen/vmulc-relu-scalar-x1.c
new file mode 100644
index 0000000..26b126d
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar kernel: y[i] = math_max_f32(a[i] * b[0], 0.0f) for each of the n / sizeof(float) elements of a.
+void xnn_f32_vmulc_relu_ukernel__scalar_x1(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the multiplier for every element.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {  // n counts bytes, so each iteration consumes one float
+    const float va = *a++;
+    float vy = va * vb;
+    vy = math_max_f32(vy, 0.0f);  // ReLU clamp at 0; math_max_f32 is defined outside this file - presumably returns the larger argument
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-scalar-x2.c b/src/f32-vbinary/gen/vmulc-relu-scalar-x2.c
new file mode 100644
index 0000000..0a81915
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-scalar-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar kernel, unrolled by 2: y[i] = math_max_f32(a[i] * b[0], 0.0f) for each of the n / sizeof(float) elements of a.
+void xnn_f32_vmulc_relu_ukernel__scalar_x2(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the multiplier for every element.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {  // main loop: two floats per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 * vb;
+    float vy1 = va1 * vb;
+
+    // ReLU clamp at 0; math_max_f32 is defined outside this file - presumably returns the larger argument.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: exactly one float can remain after the 2-wide loop
+    const float va = *a;
+    float vy = va * vb;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-scalar-x4.c b/src/f32-vbinary/gen/vmulc-relu-scalar-x4.c
new file mode 100644
index 0000000..4e52ae1
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-scalar-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar kernel, unrolled by 4: y[i] = math_max_f32(a[i] * b[0], 0.0f) for each of the n / sizeof(float) elements of a.
+void xnn_f32_vmulc_relu_ukernel__scalar_x4(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the multiplier for every element.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: four floats per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 * vb;
+    float vy1 = va1 * vb;
+    float vy2 = va2 * vb;
+    float vy3 = va3 * vb;
+
+    // ReLU clamp at 0; math_max_f32 is defined outside this file - presumably returns the larger argument.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats left, handled one at a time
+    do {
+      const float va = *a++;
+      float vy = va * vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);  // terminates because n is a multiple of sizeof(float)
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-wasm-x1.c b/src/f32-vbinary/gen/vmulc-relu-wasm-x1.c
new file mode 100644
index 0000000..b16bbb6
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar WebAssembly kernel: y[i] = __builtin_wasm_max_f32(a[i] * b[0], 0.0f) for each of the n / sizeof(float) elements of a.
+void xnn_f32_vmulc_relu_ukernel__wasm_x1(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the multiplier for every element.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {  // n counts bytes, so each iteration consumes one float
+    const float va = *a++;
+    float vy = va * vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);  // ReLU clamp at 0 using the clang builtin for WebAssembly f32.max
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-wasm-x2.c b/src/f32-vbinary/gen/vmulc-relu-wasm-x2.c
new file mode 100644
index 0000000..ef4d8b6
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-wasm-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar WebAssembly kernel, unrolled by 2: y[i] = __builtin_wasm_max_f32(a[i] * b[0], 0.0f) for each element of a.
+void xnn_f32_vmulc_relu_ukernel__wasm_x2(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the multiplier for every element.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {  // main loop: two floats per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 * vb;
+    float vy1 = va1 * vb;
+
+    // ReLU clamp at 0 using the clang builtin for WebAssembly f32.max.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: exactly one float can remain after the 2-wide loop
+    const float va = *a;
+    float vy = va * vb;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-wasm-x4.c b/src/f32-vbinary/gen/vmulc-relu-wasm-x4.c
new file mode 100644
index 0000000..1167faa
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-wasm-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar WebAssembly kernel, unrolled by 4: y[i] = __builtin_wasm_max_f32(a[i] * b[0], 0.0f) for each element of a.
+void xnn_f32_vmulc_relu_ukernel__wasm_x4(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the multiplier for every element.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: four floats per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 * vb;
+    float vy1 = va1 * vb;
+    float vy2 = va2 * vb;
+    float vy3 = va3 * vb;
+
+    // ReLU clamp at 0 using the clang builtin for WebAssembly f32.max.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats left, handled one at a time
+    do {
+      const float va = *a++;
+      float vy = va * vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);  // terminates because n is a multiple of sizeof(float)
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..54f6fcb
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x4.c
@@ -0,0 +1,61 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// Multiply-by-constant with ReLU: stores a[i] * b[0], with sign-bit-set products replaced by zero, into y[i].
+void xnn_f32_vmulc_relu_ukernel__wasmsimd_x4(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);  // broadcast the single float *b to all four lanes
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: one 4-float vector per iteration
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb);
+
+    // ReLU via signed 32-bit integer max with the all-zero bits of 0.0f: a lane whose sign bit is set compares below 0 and is zeroed.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats left, given that n is a multiple of sizeof(float)
+    const v128_t va = wasm_v128_load(a);  // full 128-bit load; NOTE(review): reads past the last valid float of a - confirm callers allow it
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);  // same integer-max ReLU as in the main loop
+
+    if (n & (2 * sizeof(float))) {  // 2 or 3 floats left
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 in one 64-bit write; NOTE(review): double* access to float storage - confirm aliasing/alignment is acceptable
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down into lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {  // odd count: one last float
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..7d10ad8
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x8.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// Multiply-by-constant with ReLU: stores a[i] * b[0], with sign-bit-set products replaced by zero, into y[i].
+void xnn_f32_vmulc_relu_ukernel__wasmsimd_x8(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);  // broadcast the single float *b to all four lanes
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {  // main loop: two 4-float vectors per iteration
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_mul(va4567, vb);
+
+    // ReLU via signed 32-bit integer max with the all-zero bits of 0.0f: a lane whose sign bit is set compares below 0 and is zeroed.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // fewer than 8 floats left here, so this runs at most once
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);  // same integer-max ReLU as in the main loop
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats left, given that n is a multiple of sizeof(float)
+    const v128_t va = wasm_v128_load(a);  // full 128-bit load; NOTE(review): reads past the last valid float of a - confirm callers allow it
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    if (n & (2 * sizeof(float))) {  // 2 or 3 floats left
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 in one 64-bit write; NOTE(review): double* access to float storage - confirm aliasing/alignment is acceptable
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down into lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {  // odd count: one last float
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-scalar-x1.c b/src/f32-vbinary/gen/vrdivc-relu-scalar-x1.c
new file mode 100644
index 0000000..3c8adb2
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar kernel (reversed divide): y[i] = math_max_f32(b[0] / a[i], 0.0f) for each of the n / sizeof(float) elements of a.
+void xnn_f32_vrdivc_relu_ukernel__scalar_x1(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the dividend for every element.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {  // n counts bytes, so each iteration consumes one float
+    const float va = *a++;
+    float vy = vb / va;  // operands are reversed: the constant is divided by the element
+    vy = math_max_f32(vy, 0.0f);  // ReLU clamp at 0; math_max_f32 is defined outside this file - presumably returns the larger argument
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-scalar-x2.c b/src/f32-vbinary/gen/vrdivc-relu-scalar-x2.c
new file mode 100644
index 0000000..965c307
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-scalar-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar kernel (reversed divide), unrolled by 2: y[i] = math_max_f32(b[0] / a[i], 0.0f) for each element of a.
+void xnn_f32_vrdivc_relu_ukernel__scalar_x2(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the dividend for every element.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {  // main loop: two floats per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = vb / va0;
+    float vy1 = vb / va1;
+
+    // ReLU clamp at 0; math_max_f32 is defined outside this file - presumably returns the larger argument.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: exactly one float can remain after the 2-wide loop
+    const float va = *a;
+    float vy = vb / va;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-scalar-x4.c b/src/f32-vbinary/gen/vrdivc-relu-scalar-x4.c
new file mode 100644
index 0000000..a685c0f
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-scalar-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar kernel (reversed divide), unrolled by 4: y[i] = math_max_f32(b[0] / a[i], 0.0f) for each element of a.
+void xnn_f32_vrdivc_relu_ukernel__scalar_x4(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the dividend for every element.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: four floats per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = vb / va0;
+    float vy1 = vb / va1;
+    float vy2 = vb / va2;
+    float vy3 = vb / va3;
+
+    // ReLU clamp at 0; math_max_f32 is defined outside this file - presumably returns the larger argument.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats left, handled one at a time
+    do {
+      const float va = *a++;
+      float vy = vb / va;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);  // terminates because n is a multiple of sizeof(float)
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-wasm-x1.c b/src/f32-vbinary/gen/vrdivc-relu-wasm-x1.c
new file mode 100644
index 0000000..a181ad6
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar WebAssembly kernel (reversed divide): y[i] = __builtin_wasm_max_f32(b[0] / a[i], 0.0f) for each element of a.
+void xnn_f32_vrdivc_relu_ukernel__wasm_x1(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the dividend for every element.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {  // n counts bytes, so each iteration consumes one float
+    const float va = *a++;
+    float vy = vb / va;  // operands are reversed: the constant is divided by the element
+    vy = __builtin_wasm_max_f32(vy, 0.0f);  // ReLU clamp at 0 using the clang builtin for WebAssembly f32.max
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-wasm-x2.c b/src/f32-vbinary/gen/vrdivc-relu-wasm-x2.c
new file mode 100644
index 0000000..8fbd18b
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-wasm-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar WebAssembly kernel (reversed divide), unrolled by 2: y[i] = __builtin_wasm_max_f32(b[0] / a[i], 0.0f).
+void xnn_f32_vrdivc_relu_ukernel__wasm_x2(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the dividend for every element.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {  // main loop: two floats per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = vb / va0;
+    float vy1 = vb / va1;
+
+    // ReLU clamp at 0 using the clang builtin for WebAssembly f32.max.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: exactly one float can remain after the 2-wide loop
+    const float va = *a;
+    float vy = vb / va;
+    vy = __builtin_wasm_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-wasm-x4.c b/src/f32-vbinary/gen/vrdivc-relu-wasm-x4.c
new file mode 100644
index 0000000..b866a19
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-wasm-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar WebAssembly kernel (reversed divide), unrolled by 4: y[i] = __builtin_wasm_max_f32(b[0] / a[i], 0.0f).
+void xnn_f32_vrdivc_relu_ukernel__wasm_x4(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the dividend for every element.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: four floats per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = vb / va0;
+    float vy1 = vb / va1;
+    float vy2 = vb / va2;
+    float vy3 = vb / va3;
+
+    // ReLU clamp at 0 using the clang builtin for WebAssembly f32.max.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats left, handled one at a time
+    do {
+      const float va = *a++;
+      float vy = vb / va;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);  // terminates because n is a multiple of sizeof(float)
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..15ee424
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x4.c
@@ -0,0 +1,61 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// Reversed divide-by-constant with ReLU: stores b[0] / a[i], with sign-bit-set quotients replaced by zero, into y[i].
+void xnn_f32_vrdivc_relu_ukernel__wasmsimd_x4(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);  // broadcast the single float *b to all four lanes
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: one 4-float vector per iteration
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy0123 = wasm_f32x4_div(vb, va0123);
+
+    // ReLU via signed 32-bit integer max with the all-zero bits of 0.0f: a lane whose sign bit is set compares below 0 and is zeroed.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats left, given that n is a multiple of sizeof(float)
+    const v128_t va = wasm_v128_load(a);  // full 128-bit load; NOTE(review): reads past the last valid float of a - confirm callers allow it
+
+    v128_t vy = wasm_f32x4_div(vb, va);  // the lanes past the tail are divided too, but never stored
+
+    vy = wasm_i32x4_max(vy, vzero);  // same integer-max ReLU as in the main loop
+
+    if (n & (2 * sizeof(float))) {  // 2 or 3 floats left
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 in one 64-bit write; NOTE(review): double* access to float storage - confirm aliasing/alignment is acceptable
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down into lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {  // odd count: one last float
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..5affb62
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x8.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+// Reversed divide-by-constant with ReLU: stores b[0] / a[i], with sign-bit-set quotients replaced by zero, into y[i].
+void xnn_f32_vrdivc_relu_ukernel__wasmsimd_x8(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);  // broadcast the single float *b to all four lanes
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {  // main loop: two 4-float vectors per iteration
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    v128_t vy0123 = wasm_f32x4_div(vb, va0123);
+    v128_t vy4567 = wasm_f32x4_div(vb, va4567);
+
+    // ReLU via signed 32-bit integer max with the all-zero bits of 0.0f: a lane whose sign bit is set compares below 0 and is zeroed.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // fewer than 8 floats left here, so this runs at most once
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_div(vb, va);
+
+    vy = wasm_i32x4_max(vy, vzero);  // same integer-max ReLU as in the main loop
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats left, given that n is a multiple of sizeof(float)
+    const v128_t va = wasm_v128_load(a);  // full 128-bit load; NOTE(review): reads past the last valid float of a - confirm callers allow it
+
+    v128_t vy = wasm_f32x4_div(vb, va);  // the lanes past the tail are divided too, but never stored
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    if (n & (2 * sizeof(float))) {  // 2 or 3 floats left
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // lanes 0-1 in one 64-bit write; NOTE(review): double* access to float storage - confirm aliasing/alignment is acceptable
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // bring lanes 2-3 down into lanes 0-1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {  // odd count: one last float
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-scalar-x1.c b/src/f32-vbinary/gen/vrsubc-relu-scalar-x1.c
new file mode 100644
index 0000000..0ab4b6e
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar kernel (reversed subtract): y[i] = math_max_f32(b[0] - a[i], 0.0f) for each of the n / sizeof(float) elements of a.
+void xnn_f32_vrsubc_relu_ukernel__scalar_x1(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the minuend for every element.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {  // n counts bytes, so each iteration consumes one float
+    const float va = *a++;
+    float vy = vb - va;  // operands are reversed: the element is subtracted from the constant
+    vy = math_max_f32(vy, 0.0f);  // ReLU clamp at 0; math_max_f32 is defined outside this file - presumably returns the larger argument
+    *y++ = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-scalar-x2.c b/src/f32-vbinary/gen/vrsubc-relu-scalar-x2.c
new file mode 100644
index 0000000..faf11a2
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-scalar-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar kernel (reversed subtract), unrolled by 2: y[i] = math_max_f32(b[0] - a[i], 0.0f) for each element of a.
+void xnn_f32_vrsubc_relu_ukernel__scalar_x2(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the minuend for every element.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {  // main loop: two floats per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = vb - va0;
+    float vy1 = vb - va1;
+
+    // ReLU clamp at 0; math_max_f32 is defined outside this file - presumably returns the larger argument.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: exactly one float can remain after the 2-wide loop
+    const float va = *a;
+    float vy = vb - va;
+    vy = math_max_f32(vy, 0.0f);
+    *y = vy;
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-scalar-x4.c b/src/f32-vbinary/gen/vrsubc-relu-scalar-x4.c
new file mode 100644
index 0000000..8408572
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-scalar-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Scalar kernel (reversed subtract), unrolled by 4: y[i] = math_max_f32(b[0] - a[i], 0.0f) for each element of a.
+void xnn_f32_vrsubc_relu_ukernel__scalar_x4(
+    size_t n,  // size in bytes (not elements); asserted non-zero and a multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // params is never read in this body
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Only b[0] is read: it is loaded once and reused as the minuend for every element.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // main loop: four floats per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = vb - va0;
+    float vy1 = vb - va1;
+    float vy2 = vb - va2;
+    float vy3 = vb - va3;
+
+    // ReLU clamp at 0; math_max_f32 is defined outside this file - presumably returns the larger argument.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // tail: 1-3 floats left, handled one at a time
+    do {
+      const float va = *a++;
+      float vy = vb - va;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);  // terminates because n is a multiple of sizeof(float)
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-wasm-x1.c b/src/f32-vbinary/gen/vrsubc-relu-wasm-x1.c
new file mode 100644
index 0000000..37e921f
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrsubc_relu_ukernel__wasm_x1(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = __builtin_wasm_max_f32(*b - a[i], 0.0f); n counts bytes, not elements.
+  const float minuend = *b;
+  while (n >= sizeof(float)) {
+    const float x = *a++;
+    const float diff = minuend - x;
+    *y++ = __builtin_wasm_max_f32(diff, 0.0f);
+    n -= sizeof(float);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-wasm-x2.c b/src/f32-vbinary/gen/vrsubc-relu-wasm-x2.c
new file mode 100644
index 0000000..e873111
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-wasm-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrsubc_relu_ukernel__wasm_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = __builtin_wasm_max_f32(*b - a[i], 0.0f); n counts bytes, not elements.
+  const float minuend = *b;
+  // Main loop: two elements per iteration.
+  while (n >= 2 * sizeof(float)) {
+    const float x0 = a[0];
+    const float x1 = a[1];
+    a += 2;
+
+    const float d0 = minuend - x0;
+    const float d1 = minuend - x1;
+
+    // ReLU: clamp each difference at zero as it is stored.
+    y[0] = __builtin_wasm_max_f32(d0, 0.0f);
+    y[1] = __builtin_wasm_max_f32(d1, 0.0f);
+    y += 2;
+
+    n -= 2 * sizeof(float);
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // Less than a pair remains: handle the single trailing element.
+    const float x = *a;
+    const float d = minuend - x;
+    *y = __builtin_wasm_max_f32(d, 0.0f);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-wasm-x4.c b/src/f32-vbinary/gen/vrsubc-relu-wasm-x4.c
new file mode 100644
index 0000000..0aa0d11
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-wasm-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrsubc_relu_ukernel__wasm_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = __builtin_wasm_max_f32(*b - a[i], 0.0f); n counts bytes, not elements.
+  const float minuend = *b;
+
+  // Main loop: four elements per iteration.
+  while (n >= 4 * sizeof(float)) {
+    const float x0 = a[0];
+    const float x1 = a[1];
+    const float x2 = a[2];
+    const float x3 = a[3];
+    a += 4;
+
+    const float d0 = minuend - x0;
+    const float d1 = minuend - x1;
+    const float d2 = minuend - x2;
+    const float d3 = minuend - x3;
+
+    // ReLU: clamp each difference at zero as it is stored.
+    y[0] = __builtin_wasm_max_f32(d0, 0.0f);
+    y[1] = __builtin_wasm_max_f32(d1, 0.0f);
+    y[2] = __builtin_wasm_max_f32(d2, 0.0f);
+    y[3] = __builtin_wasm_max_f32(d3, 0.0f);
+    y += 4;
+
+    n -= 4 * sizeof(float);
+  }
+
+  if XNN_UNLIKELY(n != 0) {
+    // Fewer than four elements are left; finish them one at a time.
+    // The asserted n % sizeof(float) == 0 makes the countdown end at zero.
+    for (; n != 0; n -= sizeof(float)) {
+      const float x = *a++;
+      const float d = minuend - x;
+      *y++ = __builtin_wasm_max_f32(d, 0.0f);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..4ea3d1a
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x4.c
@@ -0,0 +1,61 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrsubc_relu_ukernel__wasmsimd_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Writes relu(*b - a[i]) to y[i], four lanes at a time; n is a byte count and params is not read.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy0123 = wasm_f32x4_sub(vb, va0123);
+    // ReLU via a signed 32-bit integer max against all-zero bits: any lane whose
+    // float sign bit is set compares below zero and is replaced by +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    // The load above fetches a full 16 bytes although n is below 16 here; the extra lanes are never stored.
+    v128_t vy = wasm_f32x4_sub(vb, va);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Store only the lanes covered by n: two floats as one 64-bit write, then a single float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // lanes 2,3 move down to lanes 0,1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..b50a4ac
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x8.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vrsubc_relu_ukernel__wasmsimd_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Writes relu(*b - a[i]) to y[i], eight floats per main iteration; n is a byte count and params is not read.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    v128_t vy0123 = wasm_f32x4_sub(vb, va0123);
+    v128_t vy4567 = wasm_f32x4_sub(vb, va4567);
+    // ReLU via a signed 32-bit integer max against all-zero bits: any lane whose
+    // float sign bit is set compares below zero and is replaced by +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+    // Same computation on a single vector when at least four floats remain after the 8-wide loop.
+    v128_t vy = wasm_f32x4_sub(vb, va);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    // The load above fetches a full 16 bytes although n is below 16 here; the extra lanes are never stored.
+    v128_t vy = wasm_f32x4_sub(vb, va);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Store only the lanes covered by n: two floats as one 64-bit write, then a single float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // lanes 2,3 move down to lanes 0,1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-scalar-x1.c b/src/f32-vbinary/gen/vsub-relu-scalar-x1.c
new file mode 100644
index 0000000..2aa104f
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsub_relu_ukernel__scalar_x1(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = math_max_f32(a[i] - b[i], 0.0f); n counts bytes, not elements.
+  while (n >= sizeof(float)) {
+    const float lhs = *a++;
+    const float rhs = *b++;
+    const float diff = lhs - rhs;
+    *y++ = math_max_f32(diff, 0.0f);
+    n -= sizeof(float);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-scalar-x2.c b/src/f32-vbinary/gen/vsub-relu-scalar-x2.c
new file mode 100644
index 0000000..b09f50f
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-scalar-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsub_relu_ukernel__scalar_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = math_max_f32(a[i] - b[i], 0.0f); n counts bytes, not elements.
+  // Main loop: two elements per iteration.
+  while (n >= 2 * sizeof(float)) {
+    const float lhs0 = a[0];
+    const float lhs1 = a[1];
+    a += 2;
+
+    const float rhs0 = b[0];
+    const float rhs1 = b[1];
+    b += 2;
+
+    const float d0 = lhs0 - rhs0;
+    const float d1 = lhs1 - rhs1;
+
+    // ReLU: clamp each difference at zero as it is stored.
+    y[0] = math_max_f32(d0, 0.0f);
+    y[1] = math_max_f32(d1, 0.0f);
+    y += 2;
+
+    n -= 2 * sizeof(float);
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // Less than a pair remains: handle the single trailing element.
+    const float lhs = *a;
+    const float rhs = *b;
+    const float d = lhs - rhs;
+    *y = math_max_f32(d, 0.0f);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-scalar-x4.c b/src/f32-vbinary/gen/vsub-relu-scalar-x4.c
new file mode 100644
index 0000000..3536ae9
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-scalar-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsub_relu_ukernel__scalar_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = math_max_f32(a[i] - b[i], 0.0f); n counts bytes, not elements.
+
+  // Main loop: four elements per iteration.
+  while (n >= 4 * sizeof(float)) {
+    const float lhs0 = a[0];
+    const float lhs1 = a[1];
+    const float lhs2 = a[2];
+    const float lhs3 = a[3];
+    a += 4;
+
+    const float rhs0 = b[0];
+    const float rhs1 = b[1];
+    const float rhs2 = b[2];
+    const float rhs3 = b[3];
+    b += 4;
+
+    const float d0 = lhs0 - rhs0;
+    const float d1 = lhs1 - rhs1;
+    const float d2 = lhs2 - rhs2;
+    const float d3 = lhs3 - rhs3;
+
+    // ReLU: clamp each difference at zero as it is stored.
+    y[0] = math_max_f32(d0, 0.0f);
+    y[1] = math_max_f32(d1, 0.0f);
+    y[2] = math_max_f32(d2, 0.0f);
+    y[3] = math_max_f32(d3, 0.0f);
+    y += 4;
+
+    n -= 4 * sizeof(float);
+  }
+
+  if XNN_UNLIKELY(n != 0) {
+    // Fewer than four elements are left; finish them one at a time.
+    // The asserted n % sizeof(float) == 0 makes the countdown end at zero.
+    for (; n != 0; n -= sizeof(float)) {
+      const float lhs = *a++;
+      const float rhs = *b++;
+      const float d = lhs - rhs;
+      *y++ = math_max_f32(d, 0.0f);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-wasm-x1.c b/src/f32-vbinary/gen/vsub-relu-wasm-x1.c
new file mode 100644
index 0000000..6d8de30
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsub_relu_ukernel__wasm_x1(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = __builtin_wasm_max_f32(a[i] - b[i], 0.0f); n counts bytes, not elements.
+  while (n >= sizeof(float)) {
+    const float lhs = *a++;
+    const float rhs = *b++;
+    const float diff = lhs - rhs;
+    *y++ = __builtin_wasm_max_f32(diff, 0.0f);
+    n -= sizeof(float);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-wasm-x2.c b/src/f32-vbinary/gen/vsub-relu-wasm-x2.c
new file mode 100644
index 0000000..8746a1a
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-wasm-x2.c
@@ -0,0 +1,58 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsub_relu_ukernel__wasm_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = __builtin_wasm_max_f32(a[i] - b[i], 0.0f); n counts bytes, not elements.
+  // Main loop: two elements per iteration.
+  while (n >= 2 * sizeof(float)) {
+    const float lhs0 = a[0];
+    const float lhs1 = a[1];
+    a += 2;
+
+    const float rhs0 = b[0];
+    const float rhs1 = b[1];
+    b += 2;
+
+    const float d0 = lhs0 - rhs0;
+    const float d1 = lhs1 - rhs1;
+
+    // ReLU: clamp each difference at zero as it is stored.
+    y[0] = __builtin_wasm_max_f32(d0, 0.0f);
+    y[1] = __builtin_wasm_max_f32(d1, 0.0f);
+    y += 2;
+
+    n -= 2 * sizeof(float);
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // Less than a pair remains: handle the single trailing element.
+    const float lhs = *a;
+    const float rhs = *b;
+    const float d = lhs - rhs;
+    *y = __builtin_wasm_max_f32(d, 0.0f);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-wasm-x4.c b/src/f32-vbinary/gen/vsub-relu-wasm-x4.c
new file mode 100644
index 0000000..4b5ea98
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-wasm-x4.c
@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsub_relu_ukernel__wasm_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = __builtin_wasm_max_f32(a[i] - b[i], 0.0f); n counts bytes, not elements.
+
+  // Main loop: four elements per iteration.
+  while (n >= 4 * sizeof(float)) {
+    const float lhs0 = a[0];
+    const float lhs1 = a[1];
+    const float lhs2 = a[2];
+    const float lhs3 = a[3];
+    a += 4;
+
+    const float rhs0 = b[0];
+    const float rhs1 = b[1];
+    const float rhs2 = b[2];
+    const float rhs3 = b[3];
+    b += 4;
+
+    const float d0 = lhs0 - rhs0;
+    const float d1 = lhs1 - rhs1;
+    const float d2 = lhs2 - rhs2;
+    const float d3 = lhs3 - rhs3;
+
+    // ReLU: clamp each difference at zero as it is stored.
+    y[0] = __builtin_wasm_max_f32(d0, 0.0f);
+    y[1] = __builtin_wasm_max_f32(d1, 0.0f);
+    y[2] = __builtin_wasm_max_f32(d2, 0.0f);
+    y[3] = __builtin_wasm_max_f32(d3, 0.0f);
+    y += 4;
+
+    n -= 4 * sizeof(float);
+  }
+
+  if XNN_UNLIKELY(n != 0) {
+    // Fewer than four elements are left; finish them one at a time.
+    // The asserted n % sizeof(float) == 0 makes the countdown end at zero.
+    for (; n != 0; n -= sizeof(float)) {
+      const float lhs = *a++;
+      const float rhs = *b++;
+      const float d = lhs - rhs;
+      *y++ = __builtin_wasm_max_f32(d, 0.0f);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vsub-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..a30a889
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-wasmsimd-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsub_relu_ukernel__wasmsimd_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Writes relu(a[i] - b[i]) to y[i], four lanes at a time; n is a byte count and params is not read.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb0123);
+    // ReLU via a signed 32-bit integer max against all-zero bits: any lane whose
+    // float sign bit is set compares below zero and is replaced by +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+    // Both loads above fetch a full 16 bytes although n is below 16 here; the extra lanes are never stored.
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Store only the lanes covered by n: two floats as one 64-bit write, then a single float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // lanes 2,3 move down to lanes 0,1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vsub-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..a69394e
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-wasmsimd-x8.c
@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsub_relu_ukernel__wasmsimd_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Writes relu(a[i] - b[i]) to y[i], eight floats per main iteration; n is a byte count and params is not read.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    b += 8;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_sub(va4567, vb4567);
+    // ReLU via a signed 32-bit integer max against all-zero bits: any lane whose
+    // float sign bit is set compares below zero and is replaced by +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+    // Same computation on a single vector when at least four floats remain after the 8-wide loop.
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+    // Both loads above fetch a full 16 bytes although n is below 16 here; the extra lanes are never stored.
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Store only the lanes covered by n: two floats as one 64-bit write, then a single float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // lanes 2,3 move down to lanes 0,1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-scalar-x1.c b/src/f32-vbinary/gen/vsubc-relu-scalar-x1.c
new file mode 100644
index 0000000..3ebb7f3
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-scalar-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsubc_relu_ukernel__scalar_x1(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = math_max_f32(a[i] - *b, 0.0f); n counts bytes, not elements.
+  const float subtrahend = *b;
+  while (n >= sizeof(float)) {
+    const float x = *a++;
+    const float diff = x - subtrahend;
+    *y++ = math_max_f32(diff, 0.0f);
+    n -= sizeof(float);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-scalar-x2.c b/src/f32-vbinary/gen/vsubc-relu-scalar-x2.c
new file mode 100644
index 0000000..ab12102
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-scalar-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsubc_relu_ukernel__scalar_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = math_max_f32(a[i] - *b, 0.0f); n counts bytes, not elements.
+  const float subtrahend = *b;
+  // Main loop: two elements per iteration.
+  while (n >= 2 * sizeof(float)) {
+    const float x0 = a[0];
+    const float x1 = a[1];
+    a += 2;
+
+    const float d0 = x0 - subtrahend;
+    const float d1 = x1 - subtrahend;
+
+    // ReLU: clamp each difference at zero as it is stored.
+    y[0] = math_max_f32(d0, 0.0f);
+    y[1] = math_max_f32(d1, 0.0f);
+    y += 2;
+
+    n -= 2 * sizeof(float);
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // Less than a pair remains: handle the single trailing element.
+    const float x = *a;
+    const float d = x - subtrahend;
+    *y = math_max_f32(d, 0.0f);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-scalar-x4.c b/src/f32-vbinary/gen/vsubc-relu-scalar-x4.c
new file mode 100644
index 0000000..db29dcf
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-scalar-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsubc_relu_ukernel__scalar_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = math_max_f32(a[i] - *b, 0.0f); n counts bytes, not elements.
+  const float subtrahend = *b;
+
+  // Main loop: four elements per iteration.
+  while (n >= 4 * sizeof(float)) {
+    const float x0 = a[0];
+    const float x1 = a[1];
+    const float x2 = a[2];
+    const float x3 = a[3];
+    a += 4;
+
+    const float d0 = x0 - subtrahend;
+    const float d1 = x1 - subtrahend;
+    const float d2 = x2 - subtrahend;
+    const float d3 = x3 - subtrahend;
+
+    // ReLU: clamp each difference at zero as it is stored.
+    y[0] = math_max_f32(d0, 0.0f);
+    y[1] = math_max_f32(d1, 0.0f);
+    y[2] = math_max_f32(d2, 0.0f);
+    y[3] = math_max_f32(d3, 0.0f);
+    y += 4;
+
+    n -= 4 * sizeof(float);
+  }
+
+  if XNN_UNLIKELY(n != 0) {
+    // Fewer than four elements are left; finish them one at a time.
+    // The asserted n % sizeof(float) == 0 makes the countdown end at zero.
+    for (; n != 0; n -= sizeof(float)) {
+      const float x = *a++;
+      const float d = x - subtrahend;
+      *y++ = math_max_f32(d, 0.0f);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-wasm-x1.c b/src/f32-vbinary/gen/vsubc-relu-wasm-x1.c
new file mode 100644
index 0000000..6b26df9
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-wasm-x1.c
@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsubc_relu_ukernel__wasm_x1(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = __builtin_wasm_max_f32(a[i] - *b, 0.0f); n counts bytes, not elements.
+  const float subtrahend = *b;
+  while (n >= sizeof(float)) {
+    const float x = *a++;
+    const float diff = x - subtrahend;
+    *y++ = __builtin_wasm_max_f32(diff, 0.0f);
+    n -= sizeof(float);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-wasm-x2.c b/src/f32-vbinary/gen/vsubc-relu-wasm-x2.c
new file mode 100644
index 0000000..1140931
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-wasm-x2.c
@@ -0,0 +1,54 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsubc_relu_ukernel__wasm_x2(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = __builtin_wasm_max_f32(a[i] - *b, 0.0f); n counts bytes, not elements.
+  const float subtrahend = *b;
+  // Main loop: two elements per iteration.
+  while (n >= 2 * sizeof(float)) {
+    const float x0 = a[0];
+    const float x1 = a[1];
+    a += 2;
+
+    const float d0 = x0 - subtrahend;
+    const float d1 = x1 - subtrahend;
+
+    // ReLU: clamp each difference at zero as it is stored.
+    y[0] = __builtin_wasm_max_f32(d0, 0.0f);
+    y[1] = __builtin_wasm_max_f32(d1, 0.0f);
+    y += 2;
+
+    n -= 2 * sizeof(float);
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // Less than a pair remains: handle the single trailing element.
+    const float x = *a;
+    const float d = x - subtrahend;
+    *y = __builtin_wasm_max_f32(d, 0.0f);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-wasm-x4.c b/src/f32-vbinary/gen/vsubc-relu-wasm-x4.c
new file mode 100644
index 0000000..7e9485c
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-wasm-x4.c
@@ -0,0 +1,65 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsubc_relu_ukernel__wasm_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // y[i] = __builtin_wasm_max_f32(a[i] - *b, 0.0f); n counts bytes, not elements.
+  const float subtrahend = *b;
+
+  // Main loop: four elements per iteration.
+  while (n >= 4 * sizeof(float)) {
+    const float x0 = a[0];
+    const float x1 = a[1];
+    const float x2 = a[2];
+    const float x3 = a[3];
+    a += 4;
+
+    const float d0 = x0 - subtrahend;
+    const float d1 = x1 - subtrahend;
+    const float d2 = x2 - subtrahend;
+    const float d3 = x3 - subtrahend;
+
+    // ReLU: clamp each difference at zero as it is stored.
+    y[0] = __builtin_wasm_max_f32(d0, 0.0f);
+    y[1] = __builtin_wasm_max_f32(d1, 0.0f);
+    y[2] = __builtin_wasm_max_f32(d2, 0.0f);
+    y[3] = __builtin_wasm_max_f32(d3, 0.0f);
+    y += 4;
+
+    n -= 4 * sizeof(float);
+  }
+
+  if XNN_UNLIKELY(n != 0) {
+    // Fewer than four elements are left; finish them one at a time.
+    // The asserted n % sizeof(float) == 0 makes the countdown end at zero.
+    for (; n != 0; n -= sizeof(float)) {
+      const float x = *a++;
+      const float d = x - subtrahend;
+      *y++ = __builtin_wasm_max_f32(d, 0.0f);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x4.c b/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x4.c
new file mode 100644
index 0000000..02154be
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x4.c
@@ -0,0 +1,61 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsubc_relu_ukernel__wasmsimd_x4(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Writes relu(a[i] - *b) to y[i], four lanes at a time; n is a byte count and params is not read.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb);
+    // ReLU via a signed 32-bit integer max against all-zero bits: any lane whose
+    // float sign bit is set compares below zero and is replaced by +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+
+    wasm_v128_store(y, vy0123);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    // The load above fetches a full 16 bytes although n is below 16 here; the extra lanes are never stored.
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Store only the lanes covered by n: two floats as one 64-bit write, then a single float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // lanes 2,3 move down to lanes 0,1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x8.c b/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x8.c
new file mode 100644
index 0000000..4cc1893
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x8.c
@@ -0,0 +1,76 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsubc_relu_ukernel__wasmsimd_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+  // Writes relu(a[i] - *b) to y[i], eight floats per main iteration; n is a byte count and params is not read.
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    a += 8;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_sub(va4567, vb);
+    // ReLU via a signed 32-bit integer max against all-zero bits: any lane whose
+    // float sign bit is set compares below zero and is replaced by +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    y += 8;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+    // Same computation on a single vector when at least four floats remain after the 8-wide loop.
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    // The load above fetches a full 16 bytes although n is below 16 here; the extra lanes are never stored.
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+    // Store only the lanes covered by n: two floats as one 64-bit write, then a single float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);  // lanes 2,3 move down to lanes 0,1
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}