Unroll vbinary microkernels to x8 for scalar and WebAssembly, and to x16 for WebAssembly SIMD

PiperOrigin-RevId: 329407843
diff --git a/BUILD.bazel b/BUILD.bazel
index 29b14d7..2fc11ce 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -239,111 +239,147 @@
     "src/f32-vbinary/gen/vadd-minmax-scalar-x1.c",
     "src/f32-vbinary/gen/vadd-minmax-scalar-x2.c",
     "src/f32-vbinary/gen/vadd-minmax-scalar-x4.c",
+    "src/f32-vbinary/gen/vadd-minmax-scalar-x8.c",
     "src/f32-vbinary/gen/vadd-relu-scalar-x1.c",
     "src/f32-vbinary/gen/vadd-relu-scalar-x2.c",
     "src/f32-vbinary/gen/vadd-relu-scalar-x4.c",
+    "src/f32-vbinary/gen/vadd-relu-scalar-x8.c",
     "src/f32-vbinary/gen/vadd-scalar-x1.c",
     "src/f32-vbinary/gen/vadd-scalar-x2.c",
     "src/f32-vbinary/gen/vadd-scalar-x4.c",
+    "src/f32-vbinary/gen/vadd-scalar-x8.c",
     "src/f32-vbinary/gen/vaddc-minmax-scalar-x1.c",
     "src/f32-vbinary/gen/vaddc-minmax-scalar-x2.c",
     "src/f32-vbinary/gen/vaddc-minmax-scalar-x4.c",
+    "src/f32-vbinary/gen/vaddc-minmax-scalar-x8.c",
     "src/f32-vbinary/gen/vaddc-relu-scalar-x1.c",
     "src/f32-vbinary/gen/vaddc-relu-scalar-x2.c",
     "src/f32-vbinary/gen/vaddc-relu-scalar-x4.c",
+    "src/f32-vbinary/gen/vaddc-relu-scalar-x8.c",
     "src/f32-vbinary/gen/vaddc-scalar-x1.c",
     "src/f32-vbinary/gen/vaddc-scalar-x2.c",
     "src/f32-vbinary/gen/vaddc-scalar-x4.c",
+    "src/f32-vbinary/gen/vaddc-scalar-x8.c",
     "src/f32-vbinary/gen/vdiv-minmax-scalar-x1.c",
     "src/f32-vbinary/gen/vdiv-minmax-scalar-x2.c",
     "src/f32-vbinary/gen/vdiv-minmax-scalar-x4.c",
+    "src/f32-vbinary/gen/vdiv-minmax-scalar-x8.c",
     "src/f32-vbinary/gen/vdiv-relu-scalar-x1.c",
     "src/f32-vbinary/gen/vdiv-relu-scalar-x2.c",
     "src/f32-vbinary/gen/vdiv-relu-scalar-x4.c",
+    "src/f32-vbinary/gen/vdiv-relu-scalar-x8.c",
     "src/f32-vbinary/gen/vdiv-scalar-x1.c",
     "src/f32-vbinary/gen/vdiv-scalar-x2.c",
     "src/f32-vbinary/gen/vdiv-scalar-x4.c",
+    "src/f32-vbinary/gen/vdiv-scalar-x8.c",
     "src/f32-vbinary/gen/vdivc-minmax-scalar-x1.c",
     "src/f32-vbinary/gen/vdivc-minmax-scalar-x2.c",
     "src/f32-vbinary/gen/vdivc-minmax-scalar-x4.c",
+    "src/f32-vbinary/gen/vdivc-minmax-scalar-x8.c",
     "src/f32-vbinary/gen/vdivc-relu-scalar-x1.c",
     "src/f32-vbinary/gen/vdivc-relu-scalar-x2.c",
     "src/f32-vbinary/gen/vdivc-relu-scalar-x4.c",
+    "src/f32-vbinary/gen/vdivc-relu-scalar-x8.c",
     "src/f32-vbinary/gen/vdivc-scalar-x1.c",
     "src/f32-vbinary/gen/vdivc-scalar-x2.c",
     "src/f32-vbinary/gen/vdivc-scalar-x4.c",
+    "src/f32-vbinary/gen/vdivc-scalar-x8.c",
     "src/f32-vbinary/gen/vmax-scalar-x1.c",
     "src/f32-vbinary/gen/vmax-scalar-x2.c",
     "src/f32-vbinary/gen/vmax-scalar-x4.c",
+    "src/f32-vbinary/gen/vmax-scalar-x8.c",
     "src/f32-vbinary/gen/vmaxc-scalar-x1.c",
     "src/f32-vbinary/gen/vmaxc-scalar-x2.c",
     "src/f32-vbinary/gen/vmaxc-scalar-x4.c",
+    "src/f32-vbinary/gen/vmaxc-scalar-x8.c",
     "src/f32-vbinary/gen/vmin-scalar-x1.c",
     "src/f32-vbinary/gen/vmin-scalar-x2.c",
     "src/f32-vbinary/gen/vmin-scalar-x4.c",
+    "src/f32-vbinary/gen/vmin-scalar-x8.c",
     "src/f32-vbinary/gen/vminc-scalar-x1.c",
     "src/f32-vbinary/gen/vminc-scalar-x2.c",
     "src/f32-vbinary/gen/vminc-scalar-x4.c",
+    "src/f32-vbinary/gen/vminc-scalar-x8.c",
     "src/f32-vbinary/gen/vmul-minmax-scalar-x1.c",
     "src/f32-vbinary/gen/vmul-minmax-scalar-x2.c",
     "src/f32-vbinary/gen/vmul-minmax-scalar-x4.c",
+    "src/f32-vbinary/gen/vmul-minmax-scalar-x8.c",
     "src/f32-vbinary/gen/vmul-relu-scalar-x1.c",
     "src/f32-vbinary/gen/vmul-relu-scalar-x2.c",
     "src/f32-vbinary/gen/vmul-relu-scalar-x4.c",
+    "src/f32-vbinary/gen/vmul-relu-scalar-x8.c",
     "src/f32-vbinary/gen/vmul-scalar-x1.c",
     "src/f32-vbinary/gen/vmul-scalar-x2.c",
     "src/f32-vbinary/gen/vmul-scalar-x4.c",
+    "src/f32-vbinary/gen/vmul-scalar-x8.c",
     "src/f32-vbinary/gen/vmulc-minmax-scalar-x1.c",
     "src/f32-vbinary/gen/vmulc-minmax-scalar-x2.c",
     "src/f32-vbinary/gen/vmulc-minmax-scalar-x4.c",
+    "src/f32-vbinary/gen/vmulc-minmax-scalar-x8.c",
     "src/f32-vbinary/gen/vmulc-relu-scalar-x1.c",
     "src/f32-vbinary/gen/vmulc-relu-scalar-x2.c",
     "src/f32-vbinary/gen/vmulc-relu-scalar-x4.c",
+    "src/f32-vbinary/gen/vmulc-relu-scalar-x8.c",
     "src/f32-vbinary/gen/vmulc-scalar-x1.c",
     "src/f32-vbinary/gen/vmulc-scalar-x2.c",
     "src/f32-vbinary/gen/vmulc-scalar-x4.c",
+    "src/f32-vbinary/gen/vmulc-scalar-x8.c",
     "src/f32-vbinary/gen/vrdivc-minmax-scalar-x1.c",
     "src/f32-vbinary/gen/vrdivc-minmax-scalar-x2.c",
     "src/f32-vbinary/gen/vrdivc-minmax-scalar-x4.c",
+    "src/f32-vbinary/gen/vrdivc-minmax-scalar-x8.c",
     "src/f32-vbinary/gen/vrdivc-relu-scalar-x1.c",
     "src/f32-vbinary/gen/vrdivc-relu-scalar-x2.c",
     "src/f32-vbinary/gen/vrdivc-relu-scalar-x4.c",
+    "src/f32-vbinary/gen/vrdivc-relu-scalar-x8.c",
     "src/f32-vbinary/gen/vrdivc-scalar-x1.c",
     "src/f32-vbinary/gen/vrdivc-scalar-x2.c",
     "src/f32-vbinary/gen/vrdivc-scalar-x4.c",
+    "src/f32-vbinary/gen/vrdivc-scalar-x8.c",
     "src/f32-vbinary/gen/vrsubc-minmax-scalar-x1.c",
     "src/f32-vbinary/gen/vrsubc-minmax-scalar-x2.c",
     "src/f32-vbinary/gen/vrsubc-minmax-scalar-x4.c",
+    "src/f32-vbinary/gen/vrsubc-minmax-scalar-x8.c",
     "src/f32-vbinary/gen/vrsubc-relu-scalar-x1.c",
     "src/f32-vbinary/gen/vrsubc-relu-scalar-x2.c",
     "src/f32-vbinary/gen/vrsubc-relu-scalar-x4.c",
+    "src/f32-vbinary/gen/vrsubc-relu-scalar-x8.c",
     "src/f32-vbinary/gen/vrsubc-scalar-x1.c",
     "src/f32-vbinary/gen/vrsubc-scalar-x2.c",
     "src/f32-vbinary/gen/vrsubc-scalar-x4.c",
+    "src/f32-vbinary/gen/vrsubc-scalar-x8.c",
     "src/f32-vbinary/gen/vsqrdiff-scalar-x1.c",
     "src/f32-vbinary/gen/vsqrdiff-scalar-x2.c",
     "src/f32-vbinary/gen/vsqrdiff-scalar-x4.c",
+    "src/f32-vbinary/gen/vsqrdiff-scalar-x8.c",
     "src/f32-vbinary/gen/vsqrdiffc-scalar-x1.c",
     "src/f32-vbinary/gen/vsqrdiffc-scalar-x2.c",
     "src/f32-vbinary/gen/vsqrdiffc-scalar-x4.c",
+    "src/f32-vbinary/gen/vsqrdiffc-scalar-x8.c",
     "src/f32-vbinary/gen/vsub-minmax-scalar-x1.c",
     "src/f32-vbinary/gen/vsub-minmax-scalar-x2.c",
     "src/f32-vbinary/gen/vsub-minmax-scalar-x4.c",
+    "src/f32-vbinary/gen/vsub-minmax-scalar-x8.c",
     "src/f32-vbinary/gen/vsub-relu-scalar-x1.c",
     "src/f32-vbinary/gen/vsub-relu-scalar-x2.c",
     "src/f32-vbinary/gen/vsub-relu-scalar-x4.c",
+    "src/f32-vbinary/gen/vsub-relu-scalar-x8.c",
     "src/f32-vbinary/gen/vsub-scalar-x1.c",
     "src/f32-vbinary/gen/vsub-scalar-x2.c",
     "src/f32-vbinary/gen/vsub-scalar-x4.c",
+    "src/f32-vbinary/gen/vsub-scalar-x8.c",
     "src/f32-vbinary/gen/vsubc-minmax-scalar-x1.c",
     "src/f32-vbinary/gen/vsubc-minmax-scalar-x2.c",
     "src/f32-vbinary/gen/vsubc-minmax-scalar-x4.c",
+    "src/f32-vbinary/gen/vsubc-minmax-scalar-x8.c",
     "src/f32-vbinary/gen/vsubc-relu-scalar-x1.c",
     "src/f32-vbinary/gen/vsubc-relu-scalar-x2.c",
     "src/f32-vbinary/gen/vsubc-relu-scalar-x4.c",
+    "src/f32-vbinary/gen/vsubc-relu-scalar-x8.c",
     "src/f32-vbinary/gen/vsubc-scalar-x1.c",
     "src/f32-vbinary/gen/vsubc-scalar-x2.c",
     "src/f32-vbinary/gen/vsubc-scalar-x4.c",
+    "src/f32-vbinary/gen/vsubc-scalar-x8.c",
     "src/f32-vmulcaddc/gen/c1-minmax-scalar-2x.c",
     "src/f32-vmulcaddc/gen/c2-minmax-scalar-2x.c",
     "src/f32-vmulcaddc/gen/c4-minmax-scalar-2x.c",
@@ -510,75 +546,99 @@
     "src/f32-vbinary/gen/vadd-minmax-wasm-x1.c",
     "src/f32-vbinary/gen/vadd-minmax-wasm-x2.c",
     "src/f32-vbinary/gen/vadd-minmax-wasm-x4.c",
+    "src/f32-vbinary/gen/vadd-minmax-wasm-x8.c",
     "src/f32-vbinary/gen/vaddc-minmax-wasm-x1.c",
     "src/f32-vbinary/gen/vaddc-minmax-wasm-x2.c",
     "src/f32-vbinary/gen/vaddc-minmax-wasm-x4.c",
+    "src/f32-vbinary/gen/vaddc-minmax-wasm-x8.c",
     "src/f32-vbinary/gen/vdiv-minmax-wasm-x1.c",
     "src/f32-vbinary/gen/vdiv-minmax-wasm-x2.c",
     "src/f32-vbinary/gen/vdiv-minmax-wasm-x4.c",
+    "src/f32-vbinary/gen/vdiv-minmax-wasm-x8.c",
     "src/f32-vbinary/gen/vdivc-minmax-wasm-x1.c",
     "src/f32-vbinary/gen/vdivc-minmax-wasm-x2.c",
     "src/f32-vbinary/gen/vdivc-minmax-wasm-x4.c",
+    "src/f32-vbinary/gen/vdivc-minmax-wasm-x8.c",
     "src/f32-vbinary/gen/vadd-relu-wasm-x1.c",
     "src/f32-vbinary/gen/vadd-relu-wasm-x2.c",
     "src/f32-vbinary/gen/vadd-relu-wasm-x4.c",
+    "src/f32-vbinary/gen/vadd-relu-wasm-x8.c",
     "src/f32-vbinary/gen/vaddc-relu-wasm-x1.c",
     "src/f32-vbinary/gen/vaddc-relu-wasm-x2.c",
     "src/f32-vbinary/gen/vaddc-relu-wasm-x4.c",
+    "src/f32-vbinary/gen/vaddc-relu-wasm-x8.c",
     "src/f32-vbinary/gen/vdiv-relu-wasm-x1.c",
     "src/f32-vbinary/gen/vdiv-relu-wasm-x2.c",
     "src/f32-vbinary/gen/vdiv-relu-wasm-x4.c",
+    "src/f32-vbinary/gen/vdiv-relu-wasm-x8.c",
     "src/f32-vbinary/gen/vdivc-relu-wasm-x1.c",
     "src/f32-vbinary/gen/vdivc-relu-wasm-x2.c",
     "src/f32-vbinary/gen/vdivc-relu-wasm-x4.c",
+    "src/f32-vbinary/gen/vdivc-relu-wasm-x8.c",
     "src/f32-vbinary/gen/vmax-wasm-x1.c",
     "src/f32-vbinary/gen/vmax-wasm-x2.c",
     "src/f32-vbinary/gen/vmax-wasm-x4.c",
+    "src/f32-vbinary/gen/vmax-wasm-x8.c",
     "src/f32-vbinary/gen/vmaxc-wasm-x1.c",
     "src/f32-vbinary/gen/vmaxc-wasm-x2.c",
     "src/f32-vbinary/gen/vmaxc-wasm-x4.c",
+    "src/f32-vbinary/gen/vmaxc-wasm-x8.c",
     "src/f32-vbinary/gen/vmin-wasm-x1.c",
     "src/f32-vbinary/gen/vmin-wasm-x2.c",
     "src/f32-vbinary/gen/vmin-wasm-x4.c",
+    "src/f32-vbinary/gen/vmin-wasm-x8.c",
     "src/f32-vbinary/gen/vminc-wasm-x1.c",
     "src/f32-vbinary/gen/vminc-wasm-x2.c",
     "src/f32-vbinary/gen/vminc-wasm-x4.c",
+    "src/f32-vbinary/gen/vminc-wasm-x8.c",
     "src/f32-vbinary/gen/vmul-minmax-wasm-x1.c",
     "src/f32-vbinary/gen/vmul-minmax-wasm-x2.c",
     "src/f32-vbinary/gen/vmul-minmax-wasm-x4.c",
+    "src/f32-vbinary/gen/vmul-minmax-wasm-x8.c",
     "src/f32-vbinary/gen/vmulc-minmax-wasm-x1.c",
     "src/f32-vbinary/gen/vmulc-minmax-wasm-x2.c",
     "src/f32-vbinary/gen/vmulc-minmax-wasm-x4.c",
+    "src/f32-vbinary/gen/vmulc-minmax-wasm-x8.c",
     "src/f32-vbinary/gen/vrdivc-minmax-wasm-x1.c",
     "src/f32-vbinary/gen/vrdivc-minmax-wasm-x2.c",
     "src/f32-vbinary/gen/vrdivc-minmax-wasm-x4.c",
+    "src/f32-vbinary/gen/vrdivc-minmax-wasm-x8.c",
     "src/f32-vbinary/gen/vrsubc-minmax-wasm-x1.c",
     "src/f32-vbinary/gen/vrsubc-minmax-wasm-x2.c",
     "src/f32-vbinary/gen/vrsubc-minmax-wasm-x4.c",
+    "src/f32-vbinary/gen/vrsubc-minmax-wasm-x8.c",
     "src/f32-vbinary/gen/vsub-minmax-wasm-x1.c",
     "src/f32-vbinary/gen/vsub-minmax-wasm-x2.c",
     "src/f32-vbinary/gen/vsub-minmax-wasm-x4.c",
+    "src/f32-vbinary/gen/vsub-minmax-wasm-x8.c",
     "src/f32-vbinary/gen/vsubc-minmax-wasm-x1.c",
     "src/f32-vbinary/gen/vsubc-minmax-wasm-x2.c",
     "src/f32-vbinary/gen/vsubc-minmax-wasm-x4.c",
+    "src/f32-vbinary/gen/vsubc-minmax-wasm-x8.c",
     "src/f32-vbinary/gen/vmul-relu-wasm-x1.c",
     "src/f32-vbinary/gen/vmul-relu-wasm-x2.c",
     "src/f32-vbinary/gen/vmul-relu-wasm-x4.c",
+    "src/f32-vbinary/gen/vmul-relu-wasm-x8.c",
     "src/f32-vbinary/gen/vmulc-relu-wasm-x1.c",
     "src/f32-vbinary/gen/vmulc-relu-wasm-x2.c",
     "src/f32-vbinary/gen/vmulc-relu-wasm-x4.c",
+    "src/f32-vbinary/gen/vmulc-relu-wasm-x8.c",
     "src/f32-vbinary/gen/vrdivc-relu-wasm-x1.c",
     "src/f32-vbinary/gen/vrdivc-relu-wasm-x2.c",
     "src/f32-vbinary/gen/vrdivc-relu-wasm-x4.c",
+    "src/f32-vbinary/gen/vrdivc-relu-wasm-x8.c",
     "src/f32-vbinary/gen/vrsubc-relu-wasm-x1.c",
     "src/f32-vbinary/gen/vrsubc-relu-wasm-x2.c",
     "src/f32-vbinary/gen/vrsubc-relu-wasm-x4.c",
+    "src/f32-vbinary/gen/vrsubc-relu-wasm-x8.c",
     "src/f32-vbinary/gen/vsub-relu-wasm-x1.c",
     "src/f32-vbinary/gen/vsub-relu-wasm-x2.c",
     "src/f32-vbinary/gen/vsub-relu-wasm-x4.c",
+    "src/f32-vbinary/gen/vsub-relu-wasm-x8.c",
     "src/f32-vbinary/gen/vsubc-relu-wasm-x1.c",
     "src/f32-vbinary/gen/vsubc-relu-wasm-x2.c",
     "src/f32-vbinary/gen/vsubc-relu-wasm-x4.c",
+    "src/f32-vbinary/gen/vsubc-relu-wasm-x8.c",
     "src/f32-vlrelu/gen/vlrelu-wasm-x1.c",
     "src/f32-vlrelu/gen/vlrelu-wasm-x2.c",
     "src/f32-vlrelu/gen/vlrelu-wasm-x4.c",
@@ -807,104 +867,154 @@
     "src/f32-sigmoid/gen/wasmsimd-lut64-p2-div-x24.c",
     "src/f32-vbinary/gen/vadd-minmax-wasmsimd-arm-x4.c",
     "src/f32-vbinary/gen/vadd-minmax-wasmsimd-arm-x8.c",
+    "src/f32-vbinary/gen/vadd-minmax-wasmsimd-arm-x16.c",
     "src/f32-vbinary/gen/vadd-minmax-wasmsimd-x86-x4.c",
     "src/f32-vbinary/gen/vadd-minmax-wasmsimd-x86-x8.c",
+    "src/f32-vbinary/gen/vadd-minmax-wasmsimd-x86-x16.c",
     "src/f32-vbinary/gen/vadd-relu-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vadd-relu-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vadd-relu-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vadd-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vadd-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vadd-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x4.c",
     "src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x8.c",
+    "src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x16.c",
     "src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x4.c",
     "src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x8.c",
+    "src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x16.c",
     "src/f32-vbinary/gen/vaddc-relu-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vaddc-relu-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vaddc-relu-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vaddc-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vaddc-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vaddc-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vdiv-minmax-wasmsimd-arm-x4.c",
     "src/f32-vbinary/gen/vdiv-minmax-wasmsimd-arm-x8.c",
+    "src/f32-vbinary/gen/vdiv-minmax-wasmsimd-arm-x16.c",
     "src/f32-vbinary/gen/vdiv-minmax-wasmsimd-x86-x4.c",
     "src/f32-vbinary/gen/vdiv-minmax-wasmsimd-x86-x8.c",
+    "src/f32-vbinary/gen/vdiv-minmax-wasmsimd-x86-x16.c",
     "src/f32-vbinary/gen/vdiv-relu-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vdiv-relu-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vdiv-relu-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vdiv-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vdiv-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vdiv-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x4.c",
     "src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x8.c",
+    "src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x16.c",
     "src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x4.c",
     "src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x8.c",
+    "src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x16.c",
     "src/f32-vbinary/gen/vdivc-relu-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vdivc-relu-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vdivc-relu-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vdivc-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vdivc-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vdivc-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vmax-wasmsimd-arm-x4.c",
     "src/f32-vbinary/gen/vmax-wasmsimd-arm-x8.c",
+    "src/f32-vbinary/gen/vmax-wasmsimd-arm-x16.c",
     "src/f32-vbinary/gen/vmax-wasmsimd-x86-x4.c",
     "src/f32-vbinary/gen/vmax-wasmsimd-x86-x8.c",
+    "src/f32-vbinary/gen/vmax-wasmsimd-x86-x16.c",
     "src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x4.c",
     "src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x8.c",
+    "src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x16.c",
     "src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x4.c",
     "src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x8.c",
+    "src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x16.c",
     "src/f32-vbinary/gen/vmin-wasmsimd-arm-x4.c",
     "src/f32-vbinary/gen/vmin-wasmsimd-arm-x8.c",
+    "src/f32-vbinary/gen/vmin-wasmsimd-arm-x16.c",
     "src/f32-vbinary/gen/vmin-wasmsimd-x86-x4.c",
     "src/f32-vbinary/gen/vmin-wasmsimd-x86-x8.c",
+    "src/f32-vbinary/gen/vmin-wasmsimd-x86-x16.c",
     "src/f32-vbinary/gen/vminc-wasmsimd-arm-x4.c",
     "src/f32-vbinary/gen/vminc-wasmsimd-arm-x8.c",
+    "src/f32-vbinary/gen/vminc-wasmsimd-arm-x16.c",
     "src/f32-vbinary/gen/vminc-wasmsimd-x86-x4.c",
     "src/f32-vbinary/gen/vminc-wasmsimd-x86-x8.c",
+    "src/f32-vbinary/gen/vminc-wasmsimd-x86-x16.c",
     "src/f32-vbinary/gen/vmul-minmax-wasmsimd-arm-x4.c",
     "src/f32-vbinary/gen/vmul-minmax-wasmsimd-arm-x8.c",
+    "src/f32-vbinary/gen/vmul-minmax-wasmsimd-arm-x16.c",
     "src/f32-vbinary/gen/vmul-minmax-wasmsimd-x86-x4.c",
     "src/f32-vbinary/gen/vmul-minmax-wasmsimd-x86-x8.c",
+    "src/f32-vbinary/gen/vmul-minmax-wasmsimd-x86-x16.c",
     "src/f32-vbinary/gen/vmul-relu-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vmul-relu-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vmul-relu-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vmul-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vmul-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vmul-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x4.c",
     "src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x8.c",
+    "src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x16.c",
     "src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x4.c",
     "src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x8.c",
+    "src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x16.c",
     "src/f32-vbinary/gen/vmulc-relu-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vmulc-relu-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vmulc-relu-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vmulc-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vmulc-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vmulc-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x4.c",
     "src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x8.c",
+    "src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x16.c",
     "src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x4.c",
     "src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x8.c",
+    "src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x16.c",
     "src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vrdivc-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vrdivc-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vrdivc-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x4.c",
     "src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x8.c",
+    "src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x16.c",
     "src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x4.c",
     "src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x8.c",
+    "src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x16.c",
     "src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vrsubc-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vrsubc-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vrsubc-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vsqrdiff-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vsqrdiff-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vsqrdiff-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vsub-minmax-wasmsimd-arm-x4.c",
     "src/f32-vbinary/gen/vsub-minmax-wasmsimd-arm-x8.c",
+    "src/f32-vbinary/gen/vsub-minmax-wasmsimd-arm-x16.c",
     "src/f32-vbinary/gen/vsub-minmax-wasmsimd-x86-x4.c",
     "src/f32-vbinary/gen/vsub-minmax-wasmsimd-x86-x8.c",
+    "src/f32-vbinary/gen/vsub-minmax-wasmsimd-x86-x16.c",
     "src/f32-vbinary/gen/vsub-relu-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vsub-relu-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vsub-relu-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vsub-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vsub-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vsub-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x4.c",
     "src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x8.c",
+    "src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x16.c",
     "src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x4.c",
     "src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x8.c",
+    "src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x16.c",
     "src/f32-vbinary/gen/vsubc-relu-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vsubc-relu-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vsubc-relu-wasmsimd-x16.c",
     "src/f32-vbinary/gen/vsubc-wasmsimd-x4.c",
     "src/f32-vbinary/gen/vsubc-wasmsimd-x8.c",
+    "src/f32-vbinary/gen/vsubc-wasmsimd-x16.c",
     "src/f32-vlrelu/gen/vlrelu-wasmsimd-bitselect-x4.c",
     "src/f32-vlrelu/gen/vlrelu-wasmsimd-bitselect-x8.c",
     "src/f32-vlrelu/gen/vlrelu-wasmsimd-minmax-x4.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ad66d07..fe6959f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -362,111 +362,147 @@
   src/f32-vbinary/gen/vadd-minmax-scalar-x1.c
   src/f32-vbinary/gen/vadd-minmax-scalar-x2.c
   src/f32-vbinary/gen/vadd-minmax-scalar-x4.c
+  src/f32-vbinary/gen/vadd-minmax-scalar-x8.c
   src/f32-vbinary/gen/vadd-relu-scalar-x1.c
   src/f32-vbinary/gen/vadd-relu-scalar-x2.c
   src/f32-vbinary/gen/vadd-relu-scalar-x4.c
+  src/f32-vbinary/gen/vadd-relu-scalar-x8.c
   src/f32-vbinary/gen/vadd-scalar-x1.c
   src/f32-vbinary/gen/vadd-scalar-x2.c
   src/f32-vbinary/gen/vadd-scalar-x4.c
+  src/f32-vbinary/gen/vadd-scalar-x8.c
   src/f32-vbinary/gen/vaddc-minmax-scalar-x1.c
   src/f32-vbinary/gen/vaddc-minmax-scalar-x2.c
   src/f32-vbinary/gen/vaddc-minmax-scalar-x4.c
+  src/f32-vbinary/gen/vaddc-minmax-scalar-x8.c
   src/f32-vbinary/gen/vaddc-relu-scalar-x1.c
   src/f32-vbinary/gen/vaddc-relu-scalar-x2.c
   src/f32-vbinary/gen/vaddc-relu-scalar-x4.c
+  src/f32-vbinary/gen/vaddc-relu-scalar-x8.c
   src/f32-vbinary/gen/vaddc-scalar-x1.c
   src/f32-vbinary/gen/vaddc-scalar-x2.c
   src/f32-vbinary/gen/vaddc-scalar-x4.c
+  src/f32-vbinary/gen/vaddc-scalar-x8.c
   src/f32-vbinary/gen/vdiv-minmax-scalar-x1.c
   src/f32-vbinary/gen/vdiv-minmax-scalar-x2.c
   src/f32-vbinary/gen/vdiv-minmax-scalar-x4.c
+  src/f32-vbinary/gen/vdiv-minmax-scalar-x8.c
   src/f32-vbinary/gen/vdiv-relu-scalar-x1.c
   src/f32-vbinary/gen/vdiv-relu-scalar-x2.c
   src/f32-vbinary/gen/vdiv-relu-scalar-x4.c
+  src/f32-vbinary/gen/vdiv-relu-scalar-x8.c
   src/f32-vbinary/gen/vdiv-scalar-x1.c
   src/f32-vbinary/gen/vdiv-scalar-x2.c
   src/f32-vbinary/gen/vdiv-scalar-x4.c
+  src/f32-vbinary/gen/vdiv-scalar-x8.c
   src/f32-vbinary/gen/vdivc-minmax-scalar-x1.c
   src/f32-vbinary/gen/vdivc-minmax-scalar-x2.c
   src/f32-vbinary/gen/vdivc-minmax-scalar-x4.c
+  src/f32-vbinary/gen/vdivc-minmax-scalar-x8.c
   src/f32-vbinary/gen/vdivc-relu-scalar-x1.c
   src/f32-vbinary/gen/vdivc-relu-scalar-x2.c
   src/f32-vbinary/gen/vdivc-relu-scalar-x4.c
+  src/f32-vbinary/gen/vdivc-relu-scalar-x8.c
   src/f32-vbinary/gen/vdivc-scalar-x1.c
   src/f32-vbinary/gen/vdivc-scalar-x2.c
   src/f32-vbinary/gen/vdivc-scalar-x4.c
+  src/f32-vbinary/gen/vdivc-scalar-x8.c
   src/f32-vbinary/gen/vmax-scalar-x1.c
   src/f32-vbinary/gen/vmax-scalar-x2.c
   src/f32-vbinary/gen/vmax-scalar-x4.c
+  src/f32-vbinary/gen/vmax-scalar-x8.c
   src/f32-vbinary/gen/vmaxc-scalar-x1.c
   src/f32-vbinary/gen/vmaxc-scalar-x2.c
   src/f32-vbinary/gen/vmaxc-scalar-x4.c
+  src/f32-vbinary/gen/vmaxc-scalar-x8.c
   src/f32-vbinary/gen/vmin-scalar-x1.c
   src/f32-vbinary/gen/vmin-scalar-x2.c
   src/f32-vbinary/gen/vmin-scalar-x4.c
+  src/f32-vbinary/gen/vmin-scalar-x8.c
   src/f32-vbinary/gen/vminc-scalar-x1.c
   src/f32-vbinary/gen/vminc-scalar-x2.c
   src/f32-vbinary/gen/vminc-scalar-x4.c
+  src/f32-vbinary/gen/vminc-scalar-x8.c
   src/f32-vbinary/gen/vmul-minmax-scalar-x1.c
   src/f32-vbinary/gen/vmul-minmax-scalar-x2.c
   src/f32-vbinary/gen/vmul-minmax-scalar-x4.c
+  src/f32-vbinary/gen/vmul-minmax-scalar-x8.c
   src/f32-vbinary/gen/vmul-relu-scalar-x1.c
   src/f32-vbinary/gen/vmul-relu-scalar-x2.c
   src/f32-vbinary/gen/vmul-relu-scalar-x4.c
+  src/f32-vbinary/gen/vmul-relu-scalar-x8.c
   src/f32-vbinary/gen/vmul-scalar-x1.c
   src/f32-vbinary/gen/vmul-scalar-x2.c
   src/f32-vbinary/gen/vmul-scalar-x4.c
+  src/f32-vbinary/gen/vmul-scalar-x8.c
   src/f32-vbinary/gen/vmulc-minmax-scalar-x1.c
   src/f32-vbinary/gen/vmulc-minmax-scalar-x2.c
   src/f32-vbinary/gen/vmulc-minmax-scalar-x4.c
+  src/f32-vbinary/gen/vmulc-minmax-scalar-x8.c
   src/f32-vbinary/gen/vmulc-relu-scalar-x1.c
   src/f32-vbinary/gen/vmulc-relu-scalar-x2.c
   src/f32-vbinary/gen/vmulc-relu-scalar-x4.c
+  src/f32-vbinary/gen/vmulc-relu-scalar-x8.c
   src/f32-vbinary/gen/vmulc-scalar-x1.c
   src/f32-vbinary/gen/vmulc-scalar-x2.c
   src/f32-vbinary/gen/vmulc-scalar-x4.c
+  src/f32-vbinary/gen/vmulc-scalar-x8.c
   src/f32-vbinary/gen/vrdivc-minmax-scalar-x1.c
   src/f32-vbinary/gen/vrdivc-minmax-scalar-x2.c
   src/f32-vbinary/gen/vrdivc-minmax-scalar-x4.c
+  src/f32-vbinary/gen/vrdivc-minmax-scalar-x8.c
   src/f32-vbinary/gen/vrdivc-relu-scalar-x1.c
   src/f32-vbinary/gen/vrdivc-relu-scalar-x2.c
   src/f32-vbinary/gen/vrdivc-relu-scalar-x4.c
+  src/f32-vbinary/gen/vrdivc-relu-scalar-x8.c
   src/f32-vbinary/gen/vrdivc-scalar-x1.c
   src/f32-vbinary/gen/vrdivc-scalar-x2.c
   src/f32-vbinary/gen/vrdivc-scalar-x4.c
+  src/f32-vbinary/gen/vrdivc-scalar-x8.c
   src/f32-vbinary/gen/vrsubc-minmax-scalar-x1.c
   src/f32-vbinary/gen/vrsubc-minmax-scalar-x2.c
   src/f32-vbinary/gen/vrsubc-minmax-scalar-x4.c
+  src/f32-vbinary/gen/vrsubc-minmax-scalar-x8.c
   src/f32-vbinary/gen/vrsubc-relu-scalar-x1.c
   src/f32-vbinary/gen/vrsubc-relu-scalar-x2.c
   src/f32-vbinary/gen/vrsubc-relu-scalar-x4.c
+  src/f32-vbinary/gen/vrsubc-relu-scalar-x8.c
   src/f32-vbinary/gen/vrsubc-scalar-x1.c
   src/f32-vbinary/gen/vrsubc-scalar-x2.c
   src/f32-vbinary/gen/vrsubc-scalar-x4.c
+  src/f32-vbinary/gen/vrsubc-scalar-x8.c
   src/f32-vbinary/gen/vsqrdiff-scalar-x1.c
   src/f32-vbinary/gen/vsqrdiff-scalar-x2.c
   src/f32-vbinary/gen/vsqrdiff-scalar-x4.c
+  src/f32-vbinary/gen/vsqrdiff-scalar-x8.c
   src/f32-vbinary/gen/vsqrdiffc-scalar-x1.c
   src/f32-vbinary/gen/vsqrdiffc-scalar-x2.c
   src/f32-vbinary/gen/vsqrdiffc-scalar-x4.c
+  src/f32-vbinary/gen/vsqrdiffc-scalar-x8.c
   src/f32-vbinary/gen/vsub-minmax-scalar-x1.c
   src/f32-vbinary/gen/vsub-minmax-scalar-x2.c
   src/f32-vbinary/gen/vsub-minmax-scalar-x4.c
+  src/f32-vbinary/gen/vsub-minmax-scalar-x8.c
   src/f32-vbinary/gen/vsub-relu-scalar-x1.c
   src/f32-vbinary/gen/vsub-relu-scalar-x2.c
   src/f32-vbinary/gen/vsub-relu-scalar-x4.c
+  src/f32-vbinary/gen/vsub-relu-scalar-x8.c
   src/f32-vbinary/gen/vsub-scalar-x1.c
   src/f32-vbinary/gen/vsub-scalar-x2.c
   src/f32-vbinary/gen/vsub-scalar-x4.c
+  src/f32-vbinary/gen/vsub-scalar-x8.c
   src/f32-vbinary/gen/vsubc-minmax-scalar-x1.c
   src/f32-vbinary/gen/vsubc-minmax-scalar-x2.c
   src/f32-vbinary/gen/vsubc-minmax-scalar-x4.c
+  src/f32-vbinary/gen/vsubc-minmax-scalar-x8.c
   src/f32-vbinary/gen/vsubc-relu-scalar-x1.c
   src/f32-vbinary/gen/vsubc-relu-scalar-x2.c
   src/f32-vbinary/gen/vsubc-relu-scalar-x4.c
+  src/f32-vbinary/gen/vsubc-relu-scalar-x8.c
   src/f32-vbinary/gen/vsubc-scalar-x1.c
   src/f32-vbinary/gen/vsubc-scalar-x2.c
   src/f32-vbinary/gen/vsubc-scalar-x4.c
+  src/f32-vbinary/gen/vsubc-scalar-x8.c
   src/f32-vmulcaddc/gen/c1-minmax-scalar-2x.c
   src/f32-vmulcaddc/gen/c2-minmax-scalar-2x.c
   src/f32-vmulcaddc/gen/c4-minmax-scalar-2x.c
diff --git a/scripts/generate-f32-vbinary.sh b/scripts/generate-f32-vbinary.sh
index c0ecaae..abab3bf 100755
--- a/scripts/generate-f32-vbinary.sh
+++ b/scripts/generate-f32-vbinary.sh
@@ -9,301 +9,410 @@
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vadd-minmax-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vadd-minmax-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vadd-minmax-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vadd-minmax-scalar-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdiv-minmax-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdiv-minmax-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdiv-minmax-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdiv-minmax-scalar-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmul-minmax-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmul-minmax-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmul-minmax-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmul-minmax-scalar-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsub-minmax-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsub-minmax-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsub-minmax-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsub-minmax-scalar-x8.c
 
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vadd-relu-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vadd-relu-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vadd-relu-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vadd-relu-scalar-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdiv-relu-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdiv-relu-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdiv-relu-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdiv-relu-scalar-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmul-relu-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmul-relu-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmul-relu-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmul-relu-scalar-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsub-relu-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsub-relu-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsub-relu-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsub-relu-scalar-x8.c
 
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vadd-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vadd-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vadd-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vadd-scalar-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vdiv-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vdiv-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vdiv-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vdiv-scalar-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmax-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmax-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmax-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmax-scalar-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmin-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmin-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmin-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmin-scalar-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmul-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmul-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmul-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmul-scalar-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsqrdiff-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsqrdiff-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsqrdiff-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsqrdiff-scalar-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsub-scalar-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsub-scalar-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsub-scalar-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsub-scalar-x8.c
 
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD      -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vaddc-minmax-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD      -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vaddc-minmax-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD      -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vaddc-minmax-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD      -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vaddc-minmax-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV      -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdivc-minmax-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV      -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdivc-minmax-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV      -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdivc-minmax-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV      -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdivc-minmax-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL      -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmulc-minmax-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL      -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmulc-minmax-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL      -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmulc-minmax-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL      -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmulc-minmax-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrdivc-minmax-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrdivc-minmax-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrdivc-minmax-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrdivc-minmax-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrsubc-minmax-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrsubc-minmax-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrsubc-minmax-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrsubc-minmax-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB      -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsubc-minmax-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB      -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsubc-minmax-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB      -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsubc-minmax-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB      -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsubc-minmax-scalar-x8.c
 
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD      -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vaddc-relu-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD      -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vaddc-relu-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD      -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vaddc-relu-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD      -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vaddc-relu-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV      -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdivc-relu-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV      -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdivc-relu-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV      -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdivc-relu-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV      -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdivc-relu-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL      -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmulc-relu-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL      -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmulc-relu-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL      -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmulc-relu-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL      -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmulc-relu-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrdivc-relu-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrdivc-relu-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrdivc-relu-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrdivc-relu-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrsubc-relu-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrsubc-relu-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrsubc-relu-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrsubc-relu-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB      -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsubc-relu-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB      -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsubc-relu-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB      -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsubc-relu-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB      -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsubc-relu-scalar-x8.c
 
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD      -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vaddc-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD      -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vaddc-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD      -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vaddc-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD      -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vaddc-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV      -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vdivc-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV      -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vdivc-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV      -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vdivc-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV      -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vdivc-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX      -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmaxc-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX      -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmaxc-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX      -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmaxc-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX      -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmaxc-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN      -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vminc-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN      -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vminc-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN      -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vminc-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN      -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vminc-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL      -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmulc-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL      -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmulc-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL      -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmulc-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL      -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmulc-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vrdivc-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vrdivc-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vrdivc-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vrdivc-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB     -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vrsubc-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB     -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vrsubc-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB     -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vrsubc-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB     -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vrsubc-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SQRDIFF  -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsqrdiffc-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SQRDIFF  -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsqrdiffc-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SQRDIFF  -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsqrdiffc-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SQRDIFF  -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsqrdiffc-scalar-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB      -D BATCH_TILE=1 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsubc-scalar-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB      -D BATCH_TILE=2 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsubc-scalar-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB      -D BATCH_TILE=4 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsubc-scalar-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB      -D BATCH_TILE=8 -D WASM=0 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vsubc-scalar-x8.c
 
 ### WAsm-specific micro-kernels
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vadd-minmax-wasm-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vadd-minmax-wasm-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vadd-minmax-wasm-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vadd-minmax-wasm-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdiv-minmax-wasm-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdiv-minmax-wasm-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdiv-minmax-wasm-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdiv-minmax-wasm-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmax-wasm-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmax-wasm-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmax-wasm-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MAX -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmax-wasm-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmin-wasm-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmin-wasm-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmin-wasm-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MIN -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmin-wasm-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmul-minmax-wasm-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmul-minmax-wasm-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmul-minmax-wasm-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmul-minmax-wasm-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsub-minmax-wasm-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsub-minmax-wasm-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsub-minmax-wasm-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsub-minmax-wasm-x8.c
 
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vadd-relu-wasm-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vadd-relu-wasm-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vadd-relu-wasm-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=ADD -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vadd-relu-wasm-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdiv-relu-wasm-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdiv-relu-wasm-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdiv-relu-wasm-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=DIV -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdiv-relu-wasm-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmul-relu-wasm-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmul-relu-wasm-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmul-relu-wasm-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=MUL -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmul-relu-wasm-x8.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsub-relu-wasm-x1.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsub-relu-wasm-x2.c
 tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsub-relu-wasm-x4.c
+tools/xngen src/f32-vbinary/vop-scalar.c.in -D OP=SUB -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsub-relu-wasm-x8.c
 
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD  -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vaddc-minmax-wasm-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD  -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vaddc-minmax-wasm-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD  -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vaddc-minmax-wasm-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD  -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vaddc-minmax-wasm-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV  -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdivc-minmax-wasm-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV  -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdivc-minmax-wasm-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV  -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdivc-minmax-wasm-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV  -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vdivc-minmax-wasm-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX  -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmaxc-wasm-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX  -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmaxc-wasm-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX  -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmaxc-wasm-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MAX  -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vmaxc-wasm-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN  -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vminc-wasm-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN  -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vminc-wasm-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN  -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vminc-wasm-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MIN  -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=LINEAR -o src/f32-vbinary/gen/vminc-wasm-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL  -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmulc-minmax-wasm-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL  -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmulc-minmax-wasm-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL  -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmulc-minmax-wasm-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL  -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vmulc-minmax-wasm-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrdivc-minmax-wasm-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrdivc-minmax-wasm-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrdivc-minmax-wasm-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrdivc-minmax-wasm-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrsubc-minmax-wasm-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrsubc-minmax-wasm-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrsubc-minmax-wasm-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vrsubc-minmax-wasm-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB  -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsubc-minmax-wasm-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB  -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsubc-minmax-wasm-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB  -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsubc-minmax-wasm-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB  -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsubc-minmax-wasm-x8.c
 
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD  -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vaddc-relu-wasm-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD  -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vaddc-relu-wasm-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD  -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vaddc-relu-wasm-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=ADD  -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vaddc-relu-wasm-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV  -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdivc-relu-wasm-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV  -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdivc-relu-wasm-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV  -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdivc-relu-wasm-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=DIV  -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vdivc-relu-wasm-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL  -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmulc-relu-wasm-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL  -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmulc-relu-wasm-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL  -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmulc-relu-wasm-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=MUL  -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vmulc-relu-wasm-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrdivc-relu-wasm-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrdivc-relu-wasm-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrdivc-relu-wasm-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RDIV -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrdivc-relu-wasm-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrsubc-relu-wasm-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrsubc-relu-wasm-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrsubc-relu-wasm-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=RSUB -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vrsubc-relu-wasm-x8.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB  -D BATCH_TILE=1 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsubc-relu-wasm-x1.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB  -D BATCH_TILE=2 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsubc-relu-wasm-x2.c
 tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB  -D BATCH_TILE=4 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsubc-relu-wasm-x4.c
+tools/xngen src/f32-vbinary/vopc-scalar.c.in -D OP=SUB  -D BATCH_TILE=8 -D WASM=1 -D ACTIVATION=RELU -o src/f32-vbinary/gen/vsubc-relu-wasm-x8.c
 
 ################################## WAsm SIMD ##################################
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vadd-minmax-wasmsimd-arm-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vadd-minmax-wasmsimd-x86-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vadd-minmax-wasmsimd-arm-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vadd-minmax-wasmsimd-x86-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vdiv-minmax-wasmsimd-arm-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vdiv-minmax-wasmsimd-x86-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vdiv-minmax-wasmsimd-arm-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vdiv-minmax-wasmsimd-x86-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vmul-minmax-wasmsimd-arm-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vmul-minmax-wasmsimd-x86-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vmul-minmax-wasmsimd-arm-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vmul-minmax-wasmsimd-x86-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vsub-minmax-wasmsimd-arm-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vsub-minmax-wasmsimd-x86-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vsub-minmax-wasmsimd-arm-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vsub-minmax-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vadd-minmax-wasmsimd-arm-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vadd-minmax-wasmsimd-x86-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vadd-minmax-wasmsimd-arm-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vadd-minmax-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vadd-minmax-wasmsimd-arm-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vadd-minmax-wasmsimd-x86-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vdiv-minmax-wasmsimd-arm-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vdiv-minmax-wasmsimd-x86-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vdiv-minmax-wasmsimd-arm-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vdiv-minmax-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vdiv-minmax-wasmsimd-arm-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vdiv-minmax-wasmsimd-x86-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vmul-minmax-wasmsimd-arm-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vmul-minmax-wasmsimd-x86-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vmul-minmax-wasmsimd-arm-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vmul-minmax-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vmul-minmax-wasmsimd-arm-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vmul-minmax-wasmsimd-x86-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vsub-minmax-wasmsimd-arm-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vsub-minmax-wasmsimd-x86-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vsub-minmax-wasmsimd-arm-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vsub-minmax-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vsub-minmax-wasmsimd-arm-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vsub-minmax-wasmsimd-x86-x16.c
 
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=4 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vadd-relu-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=8 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vadd-relu-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=4 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vdiv-relu-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=8 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vdiv-relu-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=4 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vmul-relu-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=8 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vmul-relu-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=4 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vsub-relu-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=8 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vsub-relu-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=4  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vadd-relu-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=8  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vadd-relu-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD -D BATCH_TILE=16 -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vadd-relu-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=4  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vdiv-relu-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=8  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vdiv-relu-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV -D BATCH_TILE=16 -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vdiv-relu-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=4  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vmul-relu-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=8  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vmul-relu-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL -D BATCH_TILE=16 -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vmul-relu-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=4  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vsub-relu-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=8  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vsub-relu-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB -D BATCH_TILE=16 -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vsub-relu-wasmsimd-x16.c
 
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vadd-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vadd-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vdiv-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vdiv-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmax-wasmsimd-arm-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmax-wasmsimd-x86-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmax-wasmsimd-arm-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmax-wasmsimd-x86-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmin-wasmsimd-arm-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmin-wasmsimd-x86-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmin-wasmsimd-arm-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmin-wasmsimd-x86-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmul-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmul-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsqrdiff-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsqrdiff-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsub-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsub-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vadd-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vadd-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=ADD     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vadd-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vdiv-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vdiv-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=DIV     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vdiv-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmax-wasmsimd-arm-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmax-wasmsimd-x86-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmax-wasmsimd-arm-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmax-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmax-wasmsimd-arm-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmax-wasmsimd-x86-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmin-wasmsimd-arm-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmin-wasmsimd-x86-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmin-wasmsimd-arm-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmin-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmin-wasmsimd-arm-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmin-wasmsimd-x86-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmul-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmul-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=MUL     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmul-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsqrdiff-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsqrdiff-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsqrdiff-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsub-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsub-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vop-wasmsimd.c.in -D OP=SUB     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsub-wasmsimd-x16.c
 
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=4 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=8 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=4  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=8  -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=0 -o src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=16 -D ACTIVATION=MINMAX -D X86=1 -o src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x16.c
 
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=4 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vaddc-relu-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=8 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vaddc-relu-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=4 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vdivc-relu-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=8 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vdivc-relu-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=4 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vmulc-relu-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=8 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vmulc-relu-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=4 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=8 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=4 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=8 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=4 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vsubc-relu-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=8 -D ACTIVATION=RELU -D X86=1 -o src/f32-vbinary/gen/vsubc-relu-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=4  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vaddc-relu-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=8  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vaddc-relu-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD  -D BATCH_TILE=16 -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vaddc-relu-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=4  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vdivc-relu-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=8  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vdivc-relu-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV  -D BATCH_TILE=16 -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vdivc-relu-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=4  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vmulc-relu-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=8  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vmulc-relu-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL  -D BATCH_TILE=16 -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vmulc-relu-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=4  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=8  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV -D BATCH_TILE=16 -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=4  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=8  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB -D BATCH_TILE=16 -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=4  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vsubc-relu-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=8  -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vsubc-relu-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB  -D BATCH_TILE=16 -D ACTIVATION=RELU -D X86=0 -o src/f32-vbinary/gen/vsubc-relu-wasmsimd-x16.c
 
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vaddc-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vaddc-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vdivc-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vdivc-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vminc-wasmsimd-arm-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vminc-wasmsimd-x86-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vminc-wasmsimd-arm-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vminc-wasmsimd-x86-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmulc-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmulc-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV    -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vrdivc-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV    -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vrdivc-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB    -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vrsubc-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB    -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vrsubc-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x8.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB     -D BATCH_TILE=4 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vsubc-wasmsimd-x4.c
-tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB     -D BATCH_TILE=8 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vsubc-wasmsimd-x8.c
-
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vaddc-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vaddc-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=ADD     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vaddc-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vdivc-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vdivc-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=DIV     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vdivc-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MAX     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vminc-wasmsimd-arm-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vminc-wasmsimd-x86-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vminc-wasmsimd-arm-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vminc-wasmsimd-x86-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vminc-wasmsimd-arm-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MIN     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=1 -o src/f32-vbinary/gen/vminc-wasmsimd-x86-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmulc-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmulc-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=MUL     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vmulc-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV    -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vrdivc-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV    -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vrdivc-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RDIV    -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vrdivc-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB    -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vrsubc-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB    -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vrsubc-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=RSUB    -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vrsubc-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SQRDIFF -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x16.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB     -D BATCH_TILE=4  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsubc-wasmsimd-x4.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB     -D BATCH_TILE=8  -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsubc-wasmsimd-x8.c
+tools/xngen src/f32-vbinary/vopc-wasmsimd.c.in -D OP=SUB     -D BATCH_TILE=16 -D ACTIVATION=LINEAR -D X86=0 -o src/f32-vbinary/gen/vsubc-wasmsimd-x16.c
 
 ################################### ARM NEON ##################################
 tools/xngen src/f32-vbinary/vop-neon.c.in -D OP=ADD     -D BATCH_TILE=4 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vadd-minmax-neon-x4.c
diff --git a/src/f32-vbinary/gen/vadd-minmax-scalar-x8.c b/src/f32-vbinary/gen/vadd-minmax-scalar-x8.c
new file mode 100644
index 0000000..b55e3dd
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-minmax-scalar-x8.c
@@ -0,0 +1,103 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_minmax_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+    float vy2 = va2 + vb2;
+    float vy3 = va3 + vb3;
+    float vy4 = va4 + vb4;
+    float vy5 = va5 + vb5;
+    float vy6 = va6 + vb6;
+    float vy7 = va7 + vb7;
+
+
+    vy0 = math_max_f32(vy0, vy_min);
+    vy1 = math_max_f32(vy1, vy_min);
+    vy2 = math_max_f32(vy2, vy_min);
+    vy3 = math_max_f32(vy3, vy_min);
+    vy4 = math_max_f32(vy4, vy_min);
+    vy5 = math_max_f32(vy5, vy_min);
+    vy6 = math_max_f32(vy6, vy_min);
+    vy7 = math_max_f32(vy7, vy_min);
+
+    vy0 = math_min_f32(vy0, vy_max);
+    vy1 = math_min_f32(vy1, vy_max);
+    vy2 = math_min_f32(vy2, vy_max);
+    vy3 = math_min_f32(vy3, vy_max);
+    vy4 = math_min_f32(vy4, vy_max);
+    vy5 = math_min_f32(vy5, vy_max);
+    vy6 = math_min_f32(vy6, vy_max);
+    vy7 = math_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va + vb;
+      vy = math_max_f32(vy, vy_min);
+      vy = math_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-minmax-wasm-x8.c b/src/f32-vbinary/gen/vadd-minmax-wasm-x8.c
new file mode 100644
index 0000000..700ff5c
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-minmax-wasm-x8.c
@@ -0,0 +1,103 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_minmax_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+    float vy2 = va2 + vb2;
+    float vy3 = va3 + vb3;
+    float vy4 = va4 + vb4;
+    float vy5 = va5 + vb5;
+    float vy6 = va6 + vb6;
+    float vy7 = va7 + vb7;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+    vy4 = __builtin_wasm_max_f32(vy4, vy_min);
+    vy5 = __builtin_wasm_max_f32(vy5, vy_min);
+    vy6 = __builtin_wasm_max_f32(vy6, vy_min);
+    vy7 = __builtin_wasm_max_f32(vy7, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+    vy4 = __builtin_wasm_min_f32(vy4, vy_max);
+    vy5 = __builtin_wasm_min_f32(vy5, vy_max);
+    vy6 = __builtin_wasm_min_f32(vy6, vy_max);
+    vy7 = __builtin_wasm_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va + vb;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-minmax-wasmsimd-arm-x16.c b/src/f32-vbinary/gen/vadd-minmax-wasmsimd-arm-x16.c
new file mode 100644
index 0000000..643d32d
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-minmax-wasmsimd-arm-x16.c
@@ -0,0 +1,102 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_add(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_add(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_add(vaCDEF, vbCDEF);
+
+
+    vy0123 = wasm_f32x4_max(vy0123, vy_min);
+    vy4567 = wasm_f32x4_max(vy4567, vy_min);
+    vy89AB = wasm_f32x4_max(vy89AB, vy_min);
+    vyCDEF = wasm_f32x4_max(vyCDEF, vy_min);
+
+    vy0123 = wasm_f32x4_min(vy0123, vy_max);
+    vy4567 = wasm_f32x4_min(vy4567, vy_max);
+    vy89AB = wasm_f32x4_min(vy89AB, vy_max);
+    vyCDEF = wasm_f32x4_min(vyCDEF, vy_max);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-minmax-wasmsimd-x86-x16.c b/src/f32-vbinary/gen/vadd-minmax-wasmsimd-x86-x16.c
new file mode 100644
index 0000000..d3f2825
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-minmax-wasmsimd-x86-x16.c
@@ -0,0 +1,115 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_add(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_add(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_add(vaCDEF, vbCDEF);
+
+
+    const v128_t vltmask0123 = wasm_f32x4_lt(vy0123, vy_min);
+    const v128_t vltmask4567 = wasm_f32x4_lt(vy4567, vy_min);
+    const v128_t vltmask89AB = wasm_f32x4_lt(vy89AB, vy_min);
+    const v128_t vltmaskCDEF = wasm_f32x4_lt(vyCDEF, vy_min);
+
+    const v128_t vngtmask0123 = wasm_f32x4_le(vy0123, vy_max);
+    vy0123 = wasm_v128_bitselect(vy_min, vy0123, vltmask0123);
+    const v128_t vngtmask4567 = wasm_f32x4_le(vy4567, vy_max);
+    vy4567 = wasm_v128_bitselect(vy_min, vy4567, vltmask4567);
+    const v128_t vngtmask89AB = wasm_f32x4_le(vy89AB, vy_max);
+    vy89AB = wasm_v128_bitselect(vy_min, vy89AB, vltmask89AB);
+    const v128_t vngtmaskCDEF = wasm_f32x4_le(vyCDEF, vy_max);
+    vyCDEF = wasm_v128_bitselect(vy_min, vyCDEF, vltmaskCDEF);
+
+    vy0123 = wasm_v128_bitselect(vy0123, vy_max, vngtmask0123);
+    vy4567 = wasm_v128_bitselect(vy4567, vy_max, vngtmask4567);
+    vy89AB = wasm_v128_bitselect(vy89AB, vy_max, vngtmask89AB);
+    vyCDEF = wasm_v128_bitselect(vyCDEF, vy_max, vngtmaskCDEF);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-scalar-x8.c b/src/f32-vbinary/gen/vadd-relu-scalar-x8.c
new file mode 100644
index 0000000..17f432d
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-scalar-x8.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_relu_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+    float vy2 = va2 + vb2;
+    float vy3 = va3 + vb3;
+    float vy4 = va4 + vb4;
+    float vy5 = va5 + vb5;
+    float vy6 = va6 + vb6;
+    float vy7 = va7 + vb7;
+
+
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+    vy4 = math_max_f32(vy4, 0.0f);
+    vy5 = math_max_f32(vy5, 0.0f);
+    vy6 = math_max_f32(vy6, 0.0f);
+    vy7 = math_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va + vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-wasm-x8.c b/src/f32-vbinary/gen/vadd-relu-wasm-x8.c
new file mode 100644
index 0000000..37ae22d
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-wasm-x8.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_relu_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+    float vy2 = va2 + vb2;
+    float vy3 = va3 + vb3;
+    float vy4 = va4 + vb4;
+    float vy5 = va5 + vb5;
+    float vy6 = va6 + vb6;
+    float vy7 = va7 + vb7;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+    vy4 = __builtin_wasm_max_f32(vy4, 0.0f);
+    vy5 = __builtin_wasm_max_f32(vy5, 0.0f);
+    vy6 = __builtin_wasm_max_f32(vy6, 0.0f);
+    vy7 = __builtin_wasm_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va + vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-relu-wasmsimd-x16.c b/src/f32-vbinary/gen/vadd-relu-wasmsimd-x16.c
new file mode 100644
index 0000000..5174e12
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-relu-wasmsimd-x16.c
@@ -0,0 +1,94 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_relu_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_add(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_add(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_add(vaCDEF, vbCDEF);
+
+
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+    vy89AB = wasm_i32x4_max(vy89AB, vzero);
+    vyCDEF = wasm_i32x4_max(vyCDEF, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-scalar-x8.c b/src/f32-vbinary/gen/vadd-scalar-x8.c
new file mode 100644
index 0000000..85b0d00
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-scalar-x8.c
@@ -0,0 +1,82 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Element-wise f32 addition y[i] = a[i] + b[i], scalar code unrolled x8.
+// NOTE(review): generated from vop-scalar.c.in — change the template, not this file.
+void xnn_f32_vadd_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  // n is a byte count, not an element count: non-zero multiple of sizeof(float).
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+    float vy2 = va2 + vb2;
+    float vy3 = va3 + vb3;
+    float vy4 = va4 + vb4;
+    float vy5 = va5 + vb5;
+    float vy6 = va6 + vb6;
+    float vy7 = va7 + vb7;
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: up to 7 trailing elements, processed one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va + vb;
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vadd-wasmsimd-x16.c b/src/f32-vbinary/gen/vadd-wasmsimd-x16.c
new file mode 100644
index 0000000..df81054
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-wasmsimd-x16.c
@@ -0,0 +1,87 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Element-wise f32 addition y[i] = a[i] + b[i] using WAsm SIMD, unrolled x16
+// (four 4-lane v128 vectors per iteration).
+// NOTE(review): generated from vop-wasmsimd.c.in — change the template, not this file.
+void xnn_f32_vadd_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  // n is a byte count: non-zero multiple of sizeof(float).
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 16 elements (4 vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_add(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_add(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_add(vaCDEF, vbCDEF);
+
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Secondary loop: one full 4-lane vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail: 1-3 leftover elements. Loads a full v128 and stores only the valid
+  // low lanes; assumes reading a few floats past the buffers is permitted —
+  // TODO(review): confirm against XNNPACK's buffer-padding convention.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-minmax-scalar-x8.c b/src/f32-vbinary/gen/vaddc-minmax-scalar-x8.c
new file mode 100644
index 0000000..f9454a8
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-minmax-scalar-x8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Broadcast f32 addition y[i] = a[i] + b[0] with min/max clamping, scalar x8.
+// vaddc kernels add a single scalar *b to every element of a.
+// NOTE(review): generated from vopc-scalar.c.in — change the template, not this file.
+void xnn_f32_vaddc_minmax_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  // n is a byte count: non-zero multiple of sizeof(float).
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Output clamping range from the minmax params.
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  // The single broadcast operand is read once, outside the loops.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+    float vy2 = va2 + vb;
+    float vy3 = va3 + vb;
+    float vy4 = va4 + vb;
+    float vy5 = va5 + vb;
+    float vy6 = va6 + vb;
+    float vy7 = va7 + vb;
+
+
+    vy0 = math_max_f32(vy0, vy_min);
+    vy1 = math_max_f32(vy1, vy_min);
+    vy2 = math_max_f32(vy2, vy_min);
+    vy3 = math_max_f32(vy3, vy_min);
+    vy4 = math_max_f32(vy4, vy_min);
+    vy5 = math_max_f32(vy5, vy_min);
+    vy6 = math_max_f32(vy6, vy_min);
+    vy7 = math_max_f32(vy7, vy_min);
+
+    vy0 = math_min_f32(vy0, vy_max);
+    vy1 = math_min_f32(vy1, vy_max);
+    vy2 = math_min_f32(vy2, vy_max);
+    vy3 = math_min_f32(vy3, vy_max);
+    vy4 = math_min_f32(vy4, vy_max);
+    vy5 = math_min_f32(vy5, vy_max);
+    vy6 = math_min_f32(vy6, vy_max);
+    vy7 = math_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: up to 7 trailing elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va + vb;
+      vy = math_max_f32(vy, vy_min);
+      vy = math_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-minmax-wasm-x8.c b/src/f32-vbinary/gen/vaddc-minmax-wasm-x8.c
new file mode 100644
index 0000000..9ce6808
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-minmax-wasm-x8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Broadcast f32 addition y[i] = a[i] + b[0] with min/max clamping, x8,
+// using wasm scalar min/max builtins (single f32.min/f32.max instructions)
+// instead of the generic math_min/max_f32 helpers.
+// NOTE(review): generated from vopc-scalar.c.in — change the template, not this file.
+void xnn_f32_vaddc_minmax_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  // n is a byte count: non-zero multiple of sizeof(float).
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Output clamping range from the minmax params.
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  // The single broadcast operand is read once, outside the loops.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+    float vy2 = va2 + vb;
+    float vy3 = va3 + vb;
+    float vy4 = va4 + vb;
+    float vy5 = va5 + vb;
+    float vy6 = va6 + vb;
+    float vy7 = va7 + vb;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+    vy4 = __builtin_wasm_max_f32(vy4, vy_min);
+    vy5 = __builtin_wasm_max_f32(vy5, vy_min);
+    vy6 = __builtin_wasm_max_f32(vy6, vy_min);
+    vy7 = __builtin_wasm_max_f32(vy7, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+    vy4 = __builtin_wasm_min_f32(vy4, vy_max);
+    vy5 = __builtin_wasm_min_f32(vy5, vy_max);
+    vy6 = __builtin_wasm_min_f32(vy6, vy_max);
+    vy7 = __builtin_wasm_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: up to 7 trailing elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va + vb;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x16.c b/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x16.c
new file mode 100644
index 0000000..cc0a98b
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x16.c
@@ -0,0 +1,92 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Broadcast f32 addition y[i] = a[i] + b[0] with min/max clamping, WAsm SIMD
+// x16. "arm" variant: uses native wasm_f32x4_min/max for the clamp (these
+// lower well on ARM; compare the x86 variant, which uses compare+bitselect).
+// NOTE(review): generated from vopc-wasmsimd.c.in — change the template, not this file.
+void xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  // n is a byte count: non-zero multiple of sizeof(float).
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Clamp bounds and broadcast operand splatted across all 4 lanes once.
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 16 elements (4 vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_add(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_add(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_add(vaCDEF, vb);
+
+
+    vy0123 = wasm_f32x4_max(vy0123, vy_min);
+    vy4567 = wasm_f32x4_max(vy4567, vy_min);
+    vy89AB = wasm_f32x4_max(vy89AB, vy_min);
+    vyCDEF = wasm_f32x4_max(vyCDEF, vy_min);
+
+    vy0123 = wasm_f32x4_min(vy0123, vy_max);
+    vy4567 = wasm_f32x4_min(vy4567, vy_max);
+    vy89AB = wasm_f32x4_min(vy89AB, vy_max);
+    vyCDEF = wasm_f32x4_min(vyCDEF, vy_max);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Secondary loop: one full 4-lane vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail: 1-3 leftover elements. Loads a full v128 and stores only the valid
+  // low lanes; assumes a small over-read past `a` is permitted — TODO(review):
+  // confirm against XNNPACK's buffer-padding convention.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x16.c b/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x16.c
new file mode 100644
index 0000000..1bc2d8b
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x16.c
@@ -0,0 +1,105 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Broadcast f32 addition y[i] = a[i] + b[0] with min/max clamping, WAsm SIMD
+// x16. "x86" variant: clamps via compare + v128_bitselect rather than
+// wasm_f32x4_min/max — presumably because min/max lower poorly on x86 SSE;
+// TODO(review): confirm rationale against the vopc-wasmsimd.c.in template.
+// NOTE(review): generated file — change the template, not this file.
+void xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  // n is a byte count: non-zero multiple of sizeof(float).
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Clamp bounds and broadcast operand splatted across all 4 lanes once.
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 16 elements (4 vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_add(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_add(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_add(vaCDEF, vb);
+
+
+    // Lower clamp: where vy < vy_min, select vy_min.
+    const v128_t vltmask0123 = wasm_f32x4_lt(vy0123, vy_min);
+    const v128_t vltmask4567 = wasm_f32x4_lt(vy4567, vy_min);
+    const v128_t vltmask89AB = wasm_f32x4_lt(vy89AB, vy_min);
+    const v128_t vltmaskCDEF = wasm_f32x4_lt(vyCDEF, vy_min);
+
+    // Upper clamp mask is computed BEFORE the lower clamp is applied
+    // (vy_min <= vy_max, so the result is the same either way).
+    const v128_t vngtmask0123 = wasm_f32x4_le(vy0123, vy_max);
+    vy0123 = wasm_v128_bitselect(vy_min, vy0123, vltmask0123);
+    const v128_t vngtmask4567 = wasm_f32x4_le(vy4567, vy_max);
+    vy4567 = wasm_v128_bitselect(vy_min, vy4567, vltmask4567);
+    const v128_t vngtmask89AB = wasm_f32x4_le(vy89AB, vy_max);
+    vy89AB = wasm_v128_bitselect(vy_min, vy89AB, vltmask89AB);
+    const v128_t vngtmaskCDEF = wasm_f32x4_le(vyCDEF, vy_max);
+    vyCDEF = wasm_v128_bitselect(vy_min, vyCDEF, vltmaskCDEF);
+
+    vy0123 = wasm_v128_bitselect(vy0123, vy_max, vngtmask0123);
+    vy4567 = wasm_v128_bitselect(vy4567, vy_max, vngtmask4567);
+    vy89AB = wasm_v128_bitselect(vy89AB, vy_max, vngtmask89AB);
+    vyCDEF = wasm_v128_bitselect(vyCDEF, vy_max, vngtmaskCDEF);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Secondary loop: one full 4-lane vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail: 1-3 leftover elements; full-vector load, partial store of low lanes.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-scalar-x8.c b/src/f32-vbinary/gen/vaddc-relu-scalar-x8.c
new file mode 100644
index 0000000..104930f
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-scalar-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Broadcast f32 addition with ReLU: y[i] = max(a[i] + b[0], 0), scalar x8.
+// NOTE(review): generated from vopc-scalar.c.in — change the template, not this file.
+void xnn_f32_vaddc_relu_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  // n is a byte count: non-zero multiple of sizeof(float).
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // The single broadcast operand is read once, outside the loops.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+    float vy2 = va2 + vb;
+    float vy3 = va3 + vb;
+    float vy4 = va4 + vb;
+    float vy5 = va5 + vb;
+    float vy6 = va6 + vb;
+    float vy7 = va7 + vb;
+
+
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+    vy4 = math_max_f32(vy4, 0.0f);
+    vy5 = math_max_f32(vy5, 0.0f);
+    vy6 = math_max_f32(vy6, 0.0f);
+    vy7 = math_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: up to 7 trailing elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va + vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-wasm-x8.c b/src/f32-vbinary/gen/vaddc-relu-wasm-x8.c
new file mode 100644
index 0000000..79b61a8
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-wasm-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Broadcast f32 addition with ReLU: y[i] = max(a[i] + b[0], 0), x8, using the
+// wasm scalar max builtin (single f32.max instruction) for the clamp.
+// NOTE(review): generated from vopc-scalar.c.in — change the template, not this file.
+void xnn_f32_vaddc_relu_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  // n is a byte count: non-zero multiple of sizeof(float).
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // The single broadcast operand is read once, outside the loops.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+    float vy2 = va2 + vb;
+    float vy3 = va3 + vb;
+    float vy4 = va4 + vb;
+    float vy5 = va5 + vb;
+    float vy6 = va6 + vb;
+    float vy7 = va7 + vb;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+    vy4 = __builtin_wasm_max_f32(vy4, 0.0f);
+    vy5 = __builtin_wasm_max_f32(vy5, 0.0f);
+    vy6 = __builtin_wasm_max_f32(vy6, 0.0f);
+    vy7 = __builtin_wasm_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: up to 7 trailing elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va + vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x16.c b/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x16.c
new file mode 100644
index 0000000..a560320
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-relu-wasmsimd-x16.c
@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Broadcast f32 addition with ReLU: y[i] = max(a[i] + b[0], 0), WAsm SIMD x16.
+// NOTE(review): generated from vopc-wasmsimd.c.in — change the template, not this file.
+void xnn_f32_vaddc_relu_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  // n is a byte count: non-zero multiple of sizeof(float).
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  // The single broadcast operand is splatted across all 4 lanes once.
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 16 elements (4 vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_add(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_add(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_add(vaCDEF, vb);
+
+
+    // ReLU via signed-integer max with 0: any IEEE-754 float with the sign
+    // bit set is a negative signed integer, so i32x4_max zeroes exactly the
+    // negative lanes (and -0.0f) while passing non-negative lanes through.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+    vy89AB = wasm_i32x4_max(vy89AB, vzero);
+    vyCDEF = wasm_i32x4_max(vyCDEF, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Secondary loop: one full 4-lane vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail: 1-3 leftover elements; full-vector load, partial store of low lanes.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-scalar-x8.c b/src/f32-vbinary/gen/vaddc-scalar-x8.c
new file mode 100644
index 0000000..93571a2
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-scalar-x8.c
@@ -0,0 +1,72 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Broadcast f32 addition y[i] = a[i] + b[0], scalar x8, no output clamping.
+// NOTE(review): generated from vopc-scalar.c.in — change the template, not this file.
+void xnn_f32_vaddc_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  // n is a byte count: non-zero multiple of sizeof(float).
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // The single broadcast operand is read once, outside the loops.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+    float vy2 = va2 + vb;
+    float vy3 = va3 + vb;
+    float vy4 = va4 + vb;
+    float vy5 = va5 + vb;
+    float vy6 = va6 + vb;
+    float vy7 = va7 + vb;
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: up to 7 trailing elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va + vb;
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vaddc-wasmsimd-x16.c b/src/f32-vbinary/gen/vaddc-wasmsimd-x16.c
new file mode 100644
index 0000000..f43302a
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-wasmsimd-x16.c
@@ -0,0 +1,77 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Broadcast f32 addition y[i] = a[i] + b[0], WAsm SIMD x16, no output clamping.
+// NOTE(review): generated from vopc-wasmsimd.c.in — change the template, not this file.
+void xnn_f32_vaddc_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  // n is a byte count: non-zero multiple of sizeof(float).
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // The single broadcast operand is splatted across all 4 lanes once.
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 16 elements (4 vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_add(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_add(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_add(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_add(vaCDEF, vb);
+
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Secondary loop: one full 4-lane vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail: 1-3 leftover elements. Loads a full v128 and stores only the valid
+  // low lanes; assumes a small over-read past `a` is permitted — TODO(review):
+  // confirm against XNNPACK's buffer-padding convention.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_add(va, vb);
+
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-minmax-scalar-x8.c b/src/f32-vbinary/gen/vdiv-minmax-scalar-x8.c
new file mode 100644
index 0000000..6799a48
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-minmax-scalar-x8.c
@@ -0,0 +1,103 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Element-wise f32 division y[i] = a[i] / b[i] with min/max clamping, scalar x8.
+// NOTE(review): generated from vop-scalar.c.in — change the template, not this file.
+void xnn_f32_vdiv_minmax_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  // n is a byte count: non-zero multiple of sizeof(float).
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Output clamping range from the minmax params.
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 / vb0;
+    float vy1 = va1 / vb1;
+    float vy2 = va2 / vb2;
+    float vy3 = va3 / vb3;
+    float vy4 = va4 / vb4;
+    float vy5 = va5 / vb5;
+    float vy6 = va6 / vb6;
+    float vy7 = va7 / vb7;
+
+
+    vy0 = math_max_f32(vy0, vy_min);
+    vy1 = math_max_f32(vy1, vy_min);
+    vy2 = math_max_f32(vy2, vy_min);
+    vy3 = math_max_f32(vy3, vy_min);
+    vy4 = math_max_f32(vy4, vy_min);
+    vy5 = math_max_f32(vy5, vy_min);
+    vy6 = math_max_f32(vy6, vy_min);
+    vy7 = math_max_f32(vy7, vy_min);
+
+    vy0 = math_min_f32(vy0, vy_max);
+    vy1 = math_min_f32(vy1, vy_max);
+    vy2 = math_min_f32(vy2, vy_max);
+    vy3 = math_min_f32(vy3, vy_max);
+    vy4 = math_min_f32(vy4, vy_max);
+    vy5 = math_min_f32(vy5, vy_max);
+    vy6 = math_min_f32(vy6, vy_max);
+    vy7 = math_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: up to 7 trailing elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va / vb;
+      vy = math_max_f32(vy, vy_min);
+      vy = math_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-minmax-wasm-x8.c b/src/f32-vbinary/gen/vdiv-minmax-wasm-x8.c
new file mode 100644
index 0000000..a769fce
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-minmax-wasm-x8.c
@@ -0,0 +1,103 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_minmax_ukernel__wasm_x8(  // f32 element-wise a/b, output clamped to [min, max]; scalar code unrolled x8, tuned for WAsm.
+    size_t n,  // length of each input in BYTES; non-zero multiple of sizeof(float)
+    const float* a,  // dividend elements
+    const float* b,  // divisor elements
+    float* y,  // output elements
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  // carries scalar min/max clamp bounds
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;  // lower clamp bound
+  const float vy_max = params->scalar.max;  // upper clamp bound
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {  // main loop: 8 elements per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 / vb0;
+    float vy1 = va1 / vb1;
+    float vy2 = va2 / vb2;
+    float vy3 = va3 / vb3;
+    float vy4 = va4 / vb4;
+    float vy5 = va5 / vb5;
+    float vy6 = va6 / vb6;
+    float vy7 = va7 / vb7;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);  // clamp below via the WAsm max builtin
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+    vy4 = __builtin_wasm_max_f32(vy4, vy_min);
+    vy5 = __builtin_wasm_max_f32(vy5, vy_min);
+    vy6 = __builtin_wasm_max_f32(vy6, vy_min);
+    vy7 = __builtin_wasm_max_f32(vy7, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);  // clamp above
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+    vy4 = __builtin_wasm_min_f32(vy4, vy_max);
+    vy5 = __builtin_wasm_min_f32(vy5, vy_max);
+    vy6 = __builtin_wasm_min_f32(vy6, vy_max);
+    vy7 = __builtin_wasm_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {  // remainder: 1..7 trailing elements, one at a time
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va / vb;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-minmax-wasmsimd-arm-x16.c b/src/f32-vbinary/gen/vdiv-minmax-wasmsimd-arm-x16.c
new file mode 100644
index 0000000..108500a
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-minmax-wasmsimd-arm-x16.c
@@ -0,0 +1,102 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16(  // f32 a/b with [min, max] clamp; WAsm SIMD, 16 elements/iteration; ARM-tuned variant using native f32x4 min/max
+    size_t n,  // length of each input in BYTES; non-zero multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);  // broadcast scalar clamp bounds to all 4 lanes
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {  // main loop: 4 vectors (16 floats) per iteration
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_div(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_div(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_div(vaCDEF, vbCDEF);
+
+
+    vy0123 = wasm_f32x4_max(vy0123, vy_min);  // clamp below with the f32x4.max instruction (fast on ARM targets)
+    vy4567 = wasm_f32x4_max(vy4567, vy_min);
+    vy89AB = wasm_f32x4_max(vy89AB, vy_min);
+    vyCDEF = wasm_f32x4_max(vyCDEF, vy_min);
+
+    vy0123 = wasm_f32x4_min(vy0123, vy_max);  // clamp above
+    vy4567 = wasm_f32x4_min(vy4567, vy_max);
+    vy89AB = wasm_f32x4_min(vy89AB, vy_max);
+    vyCDEF = wasm_f32x4_min(vyCDEF, vy_max);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // secondary loop: one vector at a time
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1..3 trailing elements
+    const v128_t va = wasm_v128_load(a);  // full 16-byte load with <4 elements left -- presumably relies on XNNPACK's input over-read allowance; verify against buffer-padding contract
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // store low two floats as one 64-bit write, then shift upper lanes down
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-minmax-wasmsimd-x86-x16.c b/src/f32-vbinary/gen/vdiv-minmax-wasmsimd-x86-x16.c
new file mode 100644
index 0000000..d3d4c00
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-minmax-wasmsimd-x86-x16.c
@@ -0,0 +1,115 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16(  // f32 a/b with [min, max] clamp; WAsm SIMD x16; x86-tuned variant clamps via compare+bitselect instead of f32x4 min/max
+    size_t n,  // length of each input in BYTES; non-zero multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);  // clamp bounds broadcast to all lanes
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {  // main loop: 4 vectors (16 floats) per iteration
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_div(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_div(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_div(vaCDEF, vbCDEF);
+
+
+    const v128_t vltmask0123 = wasm_f32x4_lt(vy0123, vy_min);  // lanes below the lower bound
+    const v128_t vltmask4567 = wasm_f32x4_lt(vy4567, vy_min);
+    const v128_t vltmask89AB = wasm_f32x4_lt(vy89AB, vy_min);
+    const v128_t vltmaskCDEF = wasm_f32x4_lt(vyCDEF, vy_min);
+
+    const v128_t vngtmask0123 = wasm_f32x4_le(vy0123, vy_max);  // lanes NOT above the upper bound; interleaved with min-selects (generated scheduling)
+    vy0123 = wasm_v128_bitselect(vy_min, vy0123, vltmask0123);  // select vy_min where below bound, else keep quotient
+    const v128_t vngtmask4567 = wasm_f32x4_le(vy4567, vy_max);
+    vy4567 = wasm_v128_bitselect(vy_min, vy4567, vltmask4567);
+    const v128_t vngtmask89AB = wasm_f32x4_le(vy89AB, vy_max);
+    vy89AB = wasm_v128_bitselect(vy_min, vy89AB, vltmask89AB);
+    const v128_t vngtmaskCDEF = wasm_f32x4_le(vyCDEF, vy_max);
+    vyCDEF = wasm_v128_bitselect(vyCDEF, vy_min, vltmaskCDEF);
+
+    vy0123 = wasm_v128_bitselect(vy0123, vy_max, vngtmask0123);  // keep quotient where <= max, else vy_max
+    vy4567 = wasm_v128_bitselect(vy4567, vy_max, vngtmask4567);
+    vy89AB = wasm_v128_bitselect(vy89AB, vy_max, vngtmask89AB);
+    vyCDEF = wasm_v128_bitselect(vyCDEF, vy_max, vngtmaskCDEF);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // secondary loop: one vector at a time
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1..3 trailing elements
+    const v128_t va = wasm_v128_load(a);  // full 16-byte load past the remainder -- presumably relies on XNNPACK's over-read allowance; verify
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // store low two floats as one 64-bit write
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-scalar-x8.c b/src/f32-vbinary/gen/vdiv-relu-scalar-x8.c
new file mode 100644
index 0000000..e03e221
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-scalar-x8.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_relu_ukernel__scalar_x8(  // f32 element-wise a/b followed by ReLU (max with 0); portable scalar code unrolled x8
+    size_t n,  // length of each input in BYTES; non-zero multiple of sizeof(float)
+    const float* a,  // dividend elements
+    const float* b,  // divisor elements
+    float* y,  // output elements
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // unused: ReLU needs no parameters
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {  // main loop: 8 elements per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 / vb0;
+    float vy1 = va1 / vb1;
+    float vy2 = va2 / vb2;
+    float vy3 = va3 / vb3;
+    float vy4 = va4 / vb4;
+    float vy5 = va5 / vb5;
+    float vy6 = va6 / vb6;
+    float vy7 = va7 / vb7;
+
+
+    vy0 = math_max_f32(vy0, 0.0f);  // ReLU: clamp negatives to zero
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+    vy4 = math_max_f32(vy4, 0.0f);
+    vy5 = math_max_f32(vy5, 0.0f);
+    vy6 = math_max_f32(vy6, 0.0f);
+    vy7 = math_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {  // remainder: 1..7 trailing elements
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va / vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-wasm-x8.c b/src/f32-vbinary/gen/vdiv-relu-wasm-x8.c
new file mode 100644
index 0000000..74e9951
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-wasm-x8.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_relu_ukernel__wasm_x8(  // f32 element-wise a/b followed by ReLU; scalar code unrolled x8 using WAsm max builtin
+    size_t n,  // length of each input in BYTES; non-zero multiple of sizeof(float)
+    const float* a,  // dividend elements
+    const float* b,  // divisor elements
+    float* y,  // output elements
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])  // unused: ReLU needs no parameters
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {  // main loop: 8 elements per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 / vb0;
+    float vy1 = va1 / vb1;
+    float vy2 = va2 / vb2;
+    float vy3 = va3 / vb3;
+    float vy4 = va4 / vb4;
+    float vy5 = va5 / vb5;
+    float vy6 = va6 / vb6;
+    float vy7 = va7 / vb7;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);  // ReLU: clamp negatives to zero via WAsm max builtin
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+    vy4 = __builtin_wasm_max_f32(vy4, 0.0f);
+    vy5 = __builtin_wasm_max_f32(vy5, 0.0f);
+    vy6 = __builtin_wasm_max_f32(vy6, 0.0f);
+    vy7 = __builtin_wasm_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {  // remainder: 1..7 trailing elements
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va / vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x16.c b/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x16.c
new file mode 100644
index 0000000..b0ed81a
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-relu-wasmsimd-x16.c
@@ -0,0 +1,94 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_relu_ukernel__wasmsimd_x16(  // f32 a/b followed by ReLU; WAsm SIMD, 16 elements per main-loop iteration
+    size_t n,  // length of each input in BYTES; non-zero multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {  // main loop: 4 vectors (16 floats) per iteration
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_div(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_div(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_div(vaCDEF, vbCDEF);
+
+
+    vy0123 = wasm_i32x4_max(vy0123, vzero);  // NOTE(review): INTEGER max on float bits implements ReLU -- non-negative floats compare correctly as signed ints, negatives map to zero; confirm intended NaN handling against the template
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+    vy89AB = wasm_i32x4_max(vy89AB, vzero);
+    vyCDEF = wasm_i32x4_max(vyCDEF, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // secondary loop: one vector at a time
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1..3 trailing elements
+    const v128_t va = wasm_v128_load(a);  // full 16-byte load past the remainder -- presumably relies on XNNPACK's over-read allowance; verify
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // store low two floats as one 64-bit write
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-scalar-x8.c b/src/f32-vbinary/gen/vdiv-scalar-x8.c
new file mode 100644
index 0000000..8535b87
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-scalar-x8.c
@@ -0,0 +1,82 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_ukernel__scalar_x8(  // f32 element-wise a/b, no activation; portable scalar code unrolled x8
+    size_t n,  // length of each input in BYTES; non-zero multiple of sizeof(float)
+    const float* a,  // dividend elements
+    const float* b,  // divisor elements
+    float* y,  // output elements
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])  // unused: no activation parameters
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {  // main loop: 8 elements per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 / vb0;
+    float vy1 = va1 / vb1;
+    float vy2 = va2 / vb2;
+    float vy3 = va3 / vb3;
+    float vy4 = va4 / vb4;
+    float vy5 = va5 / vb5;
+    float vy6 = va6 / vb6;
+    float vy7 = va7 / vb7;
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {  // remainder: 1..7 trailing elements
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va / vb;
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdiv-wasmsimd-x16.c b/src/f32-vbinary/gen/vdiv-wasmsimd-x16.c
new file mode 100644
index 0000000..f907b74
--- /dev/null
+++ b/src/f32-vbinary/gen/vdiv-wasmsimd-x16.c
@@ -0,0 +1,87 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdiv_ukernel__wasmsimd_x16(  // f32 a/b, no activation; WAsm SIMD, 16 elements per main-loop iteration
+    size_t n,  // length of each input in BYTES; non-zero multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {  // main loop: 4 vectors (16 floats) per iteration
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_div(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_div(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_div(vaCDEF, vbCDEF);
+
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // secondary loop: one vector at a time
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1..3 trailing elements
+    const v128_t va = wasm_v128_load(a);  // full 16-byte load past the remainder -- presumably relies on XNNPACK's over-read allowance; verify
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // store low two floats as one 64-bit write
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-minmax-scalar-x8.c b/src/f32-vbinary/gen/vdivc-minmax-scalar-x8.c
new file mode 100644
index 0000000..d7d2b78
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-minmax-scalar-x8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_minmax_ukernel__scalar_x8(  // f32 vector / broadcast scalar (*b) with [min, max] clamp; portable scalar code unrolled x8
+    size_t n,  // length of a and y in BYTES; non-zero multiple of sizeof(float)
+    const float* a,  // dividend elements
+    const float* b,  // pointer to the single divisor, read once
+    float* y,  // output elements
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  // carries scalar min/max clamp bounds
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;  // lower clamp bound
+  const float vy_max = params->scalar.max;  // upper clamp bound
+
+  const float vb = *b;  // broadcast divisor, shared by every element
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {  // main loop: 8 elements per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 / vb;
+    float vy1 = va1 / vb;
+    float vy2 = va2 / vb;
+    float vy3 = va3 / vb;
+    float vy4 = va4 / vb;
+    float vy5 = va5 / vb;
+    float vy6 = va6 / vb;
+    float vy7 = va7 / vb;
+
+
+    vy0 = math_max_f32(vy0, vy_min);  // clamp below
+    vy1 = math_max_f32(vy1, vy_min);
+    vy2 = math_max_f32(vy2, vy_min);
+    vy3 = math_max_f32(vy3, vy_min);
+    vy4 = math_max_f32(vy4, vy_min);
+    vy5 = math_max_f32(vy5, vy_min);
+    vy6 = math_max_f32(vy6, vy_min);
+    vy7 = math_max_f32(vy7, vy_min);
+
+    vy0 = math_min_f32(vy0, vy_max);  // clamp above
+    vy1 = math_min_f32(vy1, vy_max);
+    vy2 = math_min_f32(vy2, vy_max);
+    vy3 = math_min_f32(vy3, vy_max);
+    vy4 = math_min_f32(vy4, vy_max);
+    vy5 = math_min_f32(vy5, vy_max);
+    vy6 = math_min_f32(vy6, vy_max);
+    vy7 = math_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {  // remainder: 1..7 trailing elements
+    do {
+      const float va = *a++;
+      float vy = va / vb;
+      vy = math_max_f32(vy, vy_min);
+      vy = math_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-minmax-wasm-x8.c b/src/f32-vbinary/gen/vdivc-minmax-wasm-x8.c
new file mode 100644
index 0000000..4704088
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-minmax-wasm-x8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_minmax_ukernel__wasm_x8(  // f32 vector / broadcast scalar (*b) with [min, max] clamp; scalar code unrolled x8 using WAsm min/max builtins
+    size_t n,  // length of a and y in BYTES; non-zero multiple of sizeof(float)
+    const float* a,  // dividend elements
+    const float* b,  // pointer to the single divisor, read once
+    float* y,  // output elements
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])  // carries scalar min/max clamp bounds
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;  // lower clamp bound
+  const float vy_max = params->scalar.max;  // upper clamp bound
+
+  const float vb = *b;  // broadcast divisor, shared by every element
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {  // main loop: 8 elements per iteration
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 / vb;
+    float vy1 = va1 / vb;
+    float vy2 = va2 / vb;
+    float vy3 = va3 / vb;
+    float vy4 = va4 / vb;
+    float vy5 = va5 / vb;
+    float vy6 = va6 / vb;
+    float vy7 = va7 / vb;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);  // clamp below via WAsm max builtin
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+    vy4 = __builtin_wasm_max_f32(vy4, vy_min);
+    vy5 = __builtin_wasm_max_f32(vy5, vy_min);
+    vy6 = __builtin_wasm_max_f32(vy6, vy_min);
+    vy7 = __builtin_wasm_max_f32(vy7, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);  // clamp above
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+    vy4 = __builtin_wasm_min_f32(vy4, vy_max);
+    vy5 = __builtin_wasm_min_f32(vy5, vy_max);
+    vy6 = __builtin_wasm_min_f32(vy6, vy_max);
+    vy7 = __builtin_wasm_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {  // remainder: 1..7 trailing elements
+    do {
+      const float va = *a++;
+      float vy = va / vb;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x16.c b/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x16.c
new file mode 100644
index 0000000..e804a79
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x16.c
@@ -0,0 +1,92 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16(  // f32 vector / broadcast scalar (*b) with [min, max] clamp; WAsm SIMD x16, ARM-tuned (native f32x4 min/max)
+    size_t n,  // length of a and y in BYTES; non-zero multiple of sizeof(float)
+    const float* a,
+    const float* b,  // pointer to the single divisor, splatted once across lanes
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);  // clamp bounds broadcast to all lanes
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+  const v128_t vb = wasm_v32x4_load_splat(b);  // divisor broadcast to all lanes
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {  // main loop: 4 vectors (16 floats) per iteration
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_div(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_div(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_div(vaCDEF, vb);
+
+
+    vy0123 = wasm_f32x4_max(vy0123, vy_min);  // clamp below
+    vy4567 = wasm_f32x4_max(vy4567, vy_min);
+    vy89AB = wasm_f32x4_max(vy89AB, vy_min);
+    vyCDEF = wasm_f32x4_max(vyCDEF, vy_min);
+
+    vy0123 = wasm_f32x4_min(vy0123, vy_max);  // clamp above
+    vy4567 = wasm_f32x4_min(vy4567, vy_max);
+    vy89AB = wasm_f32x4_min(vy89AB, vy_max);
+    vyCDEF = wasm_f32x4_min(vyCDEF, vy_max);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {  // secondary loop: one vector at a time
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // 1..3 trailing elements
+    const v128_t va = wasm_v128_load(a);  // full 16-byte load past the remainder -- presumably relies on XNNPACK's over-read allowance; verify
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);  // store low two floats as one 64-bit write
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x16.c b/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x16.c
new file mode 100644
index 0000000..85013c2
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x16.c
@@ -0,0 +1,105 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  // Computes y[i] = min(max(a[i] / b[0], params->scalar.min), params->scalar.max)
+  // over n bytes (n / sizeof(float) elements), 16 floats per main-loop iteration.
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Splat the clamp bounds and the single scalar divisor across all 4 lanes.
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 4 vectors (16 floats) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_div(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_div(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_div(vaCDEF, vb);
+
+
+    // NOTE(review): this "_x86_" template variant clamps with explicit
+    // compare + bitselect instead of wasm_f32x4_min/max — presumably because
+    // the direct min/max instructions lower poorly on x86; confirm against
+    // the vopc-wasmsimd.c.in template. When a compare is false (e.g. NaN
+    // lanes), the bitselect keeps the unclamped quotient lane.
+    const v128_t vltmask0123 = wasm_f32x4_lt(vy0123, vy_min);
+    const v128_t vltmask4567 = wasm_f32x4_lt(vy4567, vy_min);
+    const v128_t vltmask89AB = wasm_f32x4_lt(vy89AB, vy_min);
+    const v128_t vltmaskCDEF = wasm_f32x4_lt(vyCDEF, vy_min);
+
+    const v128_t vngtmask0123 = wasm_f32x4_le(vy0123, vy_max);
+    vy0123 = wasm_v128_bitselect(vy_min, vy0123, vltmask0123);
+    const v128_t vngtmask4567 = wasm_f32x4_le(vy4567, vy_max);
+    vy4567 = wasm_v128_bitselect(vy_min, vy4567, vltmask4567);
+    const v128_t vngtmask89AB = wasm_f32x4_le(vy89AB, vy_max);
+    vy89AB = wasm_v128_bitselect(vy_min, vy89AB, vltmask89AB);
+    const v128_t vngtmaskCDEF = wasm_f32x4_le(vyCDEF, vy_max);
+    vyCDEF = wasm_v128_bitselect(vy_min, vyCDEF, vltmaskCDEF);
+
+    vy0123 = wasm_v128_bitselect(vy0123, vy_max, vngtmask0123);
+    vy4567 = wasm_v128_bitselect(vy4567, vy_max, vngtmask4567);
+    vy89AB = wasm_v128_bitselect(vy89AB, vy_max, vngtmask89AB);
+    vyCDEF = wasm_v128_bitselect(vyCDEF, vy_max, vngtmaskCDEF);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Remainder loop: one vector (4 floats) at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 floats: loads a full vector (reads past the valid elements —
+  // assumed safe under XNNPACK's padded-buffer convention, TODO confirm),
+  // then stores the low 2 lanes and/or 1 lane as needed.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    if (n & (2 * sizeof(float))) {
+      // Store lanes 0-1 as one 64-bit chunk, then shift lanes 2-3 down.
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-scalar-x8.c b/src/f32-vbinary/gen/vdivc-relu-scalar-x8.c
new file mode 100644
index 0000000..33ac380
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-scalar-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_relu_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  // Computes y[i] = max(a[i] / b[0], 0.0f): divide every element by a single
+  // broadcast scalar, then apply ReLU. n is a byte count of float elements.
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // b points at one scalar operand, reused for every element.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration, fully unrolled.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 / vb;
+    float vy1 = va1 / vb;
+    float vy2 = va2 / vb;
+    float vy3 = va3 / vb;
+    float vy4 = va4 / vb;
+    float vy5 = va5 / vb;
+    float vy6 = va6 / vb;
+    float vy7 = va7 / vb;
+
+
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+    vy4 = math_max_f32(vy4, 0.0f);
+    vy5 = math_max_f32(vy5, 0.0f);
+    vy6 = math_max_f32(vy6, 0.0f);
+    vy7 = math_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Tail: process the remaining 1-7 elements one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va / vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-wasm-x8.c b/src/f32-vbinary/gen/vdivc-relu-wasm-x8.c
new file mode 100644
index 0000000..4f03157
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-wasm-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_relu_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  // Computes y[i] = max(a[i] / b[0], 0.0f). Identical structure to the
+  // scalar_x8 variant, but uses __builtin_wasm_max_f32 so the ReLU compiles
+  // to the single wasm f32.max instruction.
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // b points at one scalar operand, reused for every element.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration, fully unrolled.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 / vb;
+    float vy1 = va1 / vb;
+    float vy2 = va2 / vb;
+    float vy3 = va3 / vb;
+    float vy4 = va4 / vb;
+    float vy5 = va5 / vb;
+    float vy6 = va6 / vb;
+    float vy7 = va7 / vb;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+    vy4 = __builtin_wasm_max_f32(vy4, 0.0f);
+    vy5 = __builtin_wasm_max_f32(vy5, 0.0f);
+    vy6 = __builtin_wasm_max_f32(vy6, 0.0f);
+    vy7 = __builtin_wasm_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Tail: process the remaining 1-7 elements one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va / vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x16.c b/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x16.c
new file mode 100644
index 0000000..d079a7a
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-relu-wasmsimd-x16.c
@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_relu_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  // Computes y[i] = max(a[i] / b[0], 0.0f) with wasm SIMD, 16 floats per
+  // main-loop iteration.
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  // Splat the single scalar divisor across all 4 lanes.
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 4 vectors (16 floats) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_div(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_div(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_div(vaCDEF, vb);
+
+
+    // Integer max against zero implements float ReLU on IEEE-754 bit
+    // patterns: every negative float has the sign bit set, so it compares
+    // below zero as a signed int32 (this also maps -0.0f to +0.0f).
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+    vy89AB = wasm_i32x4_max(vy89AB, vzero);
+    vyCDEF = wasm_i32x4_max(vyCDEF, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Remainder loop: one vector (4 floats) at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 floats: loads a full vector (over-read assumed safe under
+  // XNNPACK's padded-buffer convention — TODO confirm), stores 2 and/or 1 lanes.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    if (n & (2 * sizeof(float))) {
+      // Store lanes 0-1 as one 64-bit chunk, then shift lanes 2-3 down.
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-scalar-x8.c b/src/f32-vbinary/gen/vdivc-scalar-x8.c
new file mode 100644
index 0000000..6289d55
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-scalar-x8.c
@@ -0,0 +1,72 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  // Computes y[i] = a[i] / b[0]: divide every element by one broadcast
+  // scalar, with no activation. n is a byte count of float elements.
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // b points at one scalar operand, reused for every element.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration, fully unrolled.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 / vb;
+    float vy1 = va1 / vb;
+    float vy2 = va2 / vb;
+    float vy3 = va3 / vb;
+    float vy4 = va4 / vb;
+    float vy5 = va5 / vb;
+    float vy6 = va6 / vb;
+    float vy7 = va7 / vb;
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Tail: process the remaining 1-7 elements one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va / vb;
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vdivc-wasmsimd-x16.c b/src/f32-vbinary/gen/vdivc-wasmsimd-x16.c
new file mode 100644
index 0000000..6a0bd31
--- /dev/null
+++ b/src/f32-vbinary/gen/vdivc-wasmsimd-x16.c
@@ -0,0 +1,77 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vdivc_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  // Computes y[i] = a[i] / b[0] with wasm SIMD, no activation, 16 floats
+  // per main-loop iteration.
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Splat the single scalar divisor across all 4 lanes.
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 4 vectors (16 floats) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_div(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_div(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_div(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_div(vaCDEF, vb);
+
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Remainder loop: one vector (4 floats) at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 floats: loads a full vector (over-read assumed safe under
+  // XNNPACK's padded-buffer convention — TODO confirm), stores 2 and/or 1 lanes.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_div(va, vb);
+
+
+    if (n & (2 * sizeof(float))) {
+      // Store lanes 0-1 as one 64-bit chunk, then shift lanes 2-3 down.
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmax-scalar-x8.c b/src/f32-vbinary/gen/vmax-scalar-x8.c
new file mode 100644
index 0000000..bc150f5
--- /dev/null
+++ b/src/f32-vbinary/gen/vmax-scalar-x8.c
@@ -0,0 +1,82 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmax_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  // Computes the elementwise maximum of two arrays: y[i] = max(a[i], b[i]).
+  // n is a byte count of float elements.
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 8 elements per iteration, fully unrolled.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = math_max_f32(va0, vb0);
+    float vy1 = math_max_f32(va1, vb1);
+    float vy2 = math_max_f32(va2, vb2);
+    float vy3 = math_max_f32(va3, vb3);
+    float vy4 = math_max_f32(va4, vb4);
+    float vy5 = math_max_f32(va5, vb5);
+    float vy6 = math_max_f32(va6, vb6);
+    float vy7 = math_max_f32(va7, vb7);
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Tail: process the remaining 1-7 elements one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = math_max_f32(va, vb);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmax-wasm-x8.c b/src/f32-vbinary/gen/vmax-wasm-x8.c
new file mode 100644
index 0000000..a0d3ea8
--- /dev/null
+++ b/src/f32-vbinary/gen/vmax-wasm-x8.c
@@ -0,0 +1,82 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmax_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  // Computes y[i] = max(a[i], b[i]). Same structure as the scalar_x8
+  // variant, but uses __builtin_wasm_max_f32 so each max compiles to the
+  // single wasm f32.max instruction.
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 8 elements per iteration, fully unrolled.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = __builtin_wasm_max_f32(va0, vb0);
+    float vy1 = __builtin_wasm_max_f32(va1, vb1);
+    float vy2 = __builtin_wasm_max_f32(va2, vb2);
+    float vy3 = __builtin_wasm_max_f32(va3, vb3);
+    float vy4 = __builtin_wasm_max_f32(va4, vb4);
+    float vy5 = __builtin_wasm_max_f32(va5, vb5);
+    float vy6 = __builtin_wasm_max_f32(va6, vb6);
+    float vy7 = __builtin_wasm_max_f32(va7, vb7);
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Tail: process the remaining 1-7 elements one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = __builtin_wasm_max_f32(va, vb);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmax-wasmsimd-arm-x16.c b/src/f32-vbinary/gen/vmax-wasmsimd-arm-x16.c
new file mode 100644
index 0000000..3a7781b
--- /dev/null
+++ b/src/f32-vbinary/gen/vmax-wasmsimd-arm-x16.c
@@ -0,0 +1,87 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmax_ukernel__wasmsimd_arm_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  // Computes y[i] = max(a[i], b[i]) with wasm SIMD, 16 floats per main-loop
+  // iteration. The "_arm_" template variant uses wasm_f32x4_max directly
+  // (contrast with the _x86_ variant's compare + bitselect form).
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 4 vectors (16 floats) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_max(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_max(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_max(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_max(vaCDEF, vbCDEF);
+
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Remainder loop: one vector (4 floats) at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_max(va, vb);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 floats: loads full vectors (over-read assumed safe under
+  // XNNPACK's padded-buffer convention — TODO confirm), stores 2 and/or 1 lanes.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_max(va, vb);
+
+
+    if (n & (2 * sizeof(float))) {
+      // Store lanes 0-1 as one 64-bit chunk, then shift lanes 2-3 down.
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmax-wasmsimd-x86-x16.c b/src/f32-vbinary/gen/vmax-wasmsimd-x86-x16.c
new file mode 100644
index 0000000..8d1eabd
--- /dev/null
+++ b/src/f32-vbinary/gen/vmax-wasmsimd-x86-x16.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmax_ukernel__wasmsimd_x86_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  // Computes y[i] = max(a[i], b[i]) with wasm SIMD, 16 floats per main-loop
+  // iteration. The "_x86_" template variant builds the max from a compare
+  // plus bitselect (select b where a <= b, else keep a) rather than
+  // wasm_f32x4_max — presumably because f32x4_max lowers poorly on x86.
+  // NOTE(review): when the compare is false (including NaN lanes) the a
+  // lane is kept, which differs from f32x4_max's NaN propagation.
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 4 vectors (16 floats) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    const v128_t vm0123 = wasm_f32x4_le(va0123, vb0123);
+    const v128_t vm4567 = wasm_f32x4_le(va4567, vb4567);
+    const v128_t vm89AB = wasm_f32x4_le(va89AB, vb89AB);
+    const v128_t vmCDEF = wasm_f32x4_le(vaCDEF, vbCDEF);
+
+    v128_t vy0123 = wasm_v128_bitselect(vb0123, va0123, vm0123);
+    v128_t vy4567 = wasm_v128_bitselect(vb4567, va4567, vm4567);
+    v128_t vy89AB = wasm_v128_bitselect(vb89AB, va89AB, vm89AB);
+    v128_t vyCDEF = wasm_v128_bitselect(vbCDEF, vaCDEF, vmCDEF);
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Remainder loop: one vector (4 floats) at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    const v128_t vm = wasm_f32x4_le(va, vb);
+    v128_t vy = wasm_v128_bitselect(vb, va, vm);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 floats: loads full vectors (over-read assumed safe under
+  // XNNPACK's padded-buffer convention — TODO confirm), stores 2 and/or 1 lanes.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    const v128_t vm = wasm_f32x4_le(va, vb);
+    v128_t vy = wasm_v128_bitselect(vb, va, vm);
+
+
+    if (n & (2 * sizeof(float))) {
+      // Store lanes 0-1 as one 64-bit chunk, then shift lanes 2-3 down.
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmaxc-scalar-x8.c b/src/f32-vbinary/gen/vmaxc-scalar-x8.c
new file mode 100644
index 0000000..6af0e8f
--- /dev/null
+++ b/src/f32-vbinary/gen/vmaxc-scalar-x8.c
@@ -0,0 +1,72 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmaxc_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  // Computes y[i] = max(a[i], b[0]): elementwise maximum against one
+  // broadcast scalar. n is a byte count of float elements.
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // b points at one scalar operand, reused for every element.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration, fully unrolled.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = math_max_f32(va0, vb);
+    float vy1 = math_max_f32(va1, vb);
+    float vy2 = math_max_f32(va2, vb);
+    float vy3 = math_max_f32(va3, vb);
+    float vy4 = math_max_f32(va4, vb);
+    float vy5 = math_max_f32(va5, vb);
+    float vy6 = math_max_f32(va6, vb);
+    float vy7 = math_max_f32(va7, vb);
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Tail: process the remaining 1-7 elements one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = math_max_f32(va, vb);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmaxc-wasm-x8.c b/src/f32-vbinary/gen/vmaxc-wasm-x8.c
new file mode 100644
index 0000000..95f619d
--- /dev/null
+++ b/src/f32-vbinary/gen/vmaxc-wasm-x8.c
@@ -0,0 +1,72 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmaxc_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  // Computes y[i] = max(a[i], b[0]). Same structure as the scalar_x8
+  // variant, but uses __builtin_wasm_max_f32 so each max compiles to the
+  // single wasm f32.max instruction.
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // b points at one scalar operand, reused for every element.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration, fully unrolled.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = __builtin_wasm_max_f32(va0, vb);
+    float vy1 = __builtin_wasm_max_f32(va1, vb);
+    float vy2 = __builtin_wasm_max_f32(va2, vb);
+    float vy3 = __builtin_wasm_max_f32(va3, vb);
+    float vy4 = __builtin_wasm_max_f32(va4, vb);
+    float vy5 = __builtin_wasm_max_f32(va5, vb);
+    float vy6 = __builtin_wasm_max_f32(va6, vb);
+    float vy7 = __builtin_wasm_max_f32(va7, vb);
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Tail: process the remaining 1-7 elements one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = __builtin_wasm_max_f32(va, vb);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x16.c b/src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x16.c
new file mode 100644
index 0000000..d086deb
--- /dev/null
+++ b/src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x16.c
@@ -0,0 +1,77 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  // Computes y[i] = max(a[i], b[0]) with wasm SIMD, 16 floats per main-loop
+  // iteration. The "_arm_" template variant uses wasm_f32x4_max directly
+  // (contrast with the _x86_ variant's compare + bitselect form).
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Splat the single scalar operand across all 4 lanes.
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 4 vectors (16 floats) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_max(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_max(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_max(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_max(vaCDEF, vb);
+
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Remainder loop: one vector (4 floats) at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_max(va, vb);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 floats: loads a full vector (over-read assumed safe under
+  // XNNPACK's padded-buffer convention — TODO confirm), stores 2 and/or 1 lanes.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_max(va, vb);
+
+
+    if (n & (2 * sizeof(float))) {
+      // Store lanes 0-1 as one 64-bit chunk, then shift lanes 2-3 down.
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x16.c b/src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x16.c
new file mode 100644
index 0000000..e44deeb
--- /dev/null
+++ b/src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x16.c
@@ -0,0 +1,83 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// y[i] := max(a[i], *b) — constant-operand variant, x86-tuned WAsm SIMD build.
+// Implements max as compare (a <= b) + bitselect instead of f32x4.max.
+// NOTE(review): when a lane of `a` is NaN the compare is false and `a` (NaN)
+// is selected, while a NaN in `b` is NOT propagated — this differs from the
+// f32x4.max used by the ARM variant; presumably an intentional template choice.
+void xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Broadcast the scalar operand across all 4 lanes once.
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 16 floats (4 vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    // Mask lanes where a <= b; bitselect then keeps b there, a elsewhere.
+    const v128_t vm0123 = wasm_f32x4_le(va0123, vb);
+    const v128_t vm4567 = wasm_f32x4_le(va4567, vb);
+    const v128_t vm89AB = wasm_f32x4_le(va89AB, vb);
+    const v128_t vmCDEF = wasm_f32x4_le(vaCDEF, vb);
+
+    v128_t vy0123 = wasm_v128_bitselect(vb, va0123, vm0123);
+    v128_t vy4567 = wasm_v128_bitselect(vb, va4567, vm4567);
+    v128_t vy89AB = wasm_v128_bitselect(vb, va89AB, vm89AB);
+    v128_t vyCDEF = wasm_v128_bitselect(vb, vaCDEF, vmCDEF);
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Remainder: one 4-float vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vm = wasm_f32x4_le(va, vb);
+    v128_t vy = wasm_v128_bitselect(vb, va, vm);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 floats; full-vector load over-reads past the end (see note
+  // on the ARM variant — assumes caller-side padding).
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    const v128_t vm = wasm_f32x4_le(va, vb);
+    v128_t vy = wasm_v128_bitselect(vb, va, vm);
+
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmin-scalar-x8.c b/src/f32-vbinary/gen/vmin-scalar-x8.c
new file mode 100644
index 0000000..88db537
--- /dev/null
+++ b/src/f32-vbinary/gen/vmin-scalar-x8.c
@@ -0,0 +1,82 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// y[i] := min(a[i], b[i]) — elementwise minimum, portable scalar microkernel
+// unrolled 8x.  n is a byte count, a multiple of sizeof(float).
+// params is unused (default params for ops without min/max clamping).
+void xnn_f32_vmin_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 8 elements per iteration, fully unrolled for ILP.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = math_min_f32(va0, vb0);
+    float vy1 = math_min_f32(va1, vb1);
+    float vy2 = math_min_f32(va2, vb2);
+    float vy3 = math_min_f32(va3, vb3);
+    float vy4 = math_min_f32(va4, vb4);
+    float vy5 = math_min_f32(va5, vb5);
+    float vy6 = math_min_f32(va6, vb6);
+    float vy7 = math_min_f32(va7, vb7);
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Tail: 1-7 remaining elements, one at a time (no over-read).
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = math_min_f32(va, vb);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmin-wasm-x8.c b/src/f32-vbinary/gen/vmin-wasm-x8.c
new file mode 100644
index 0000000..21ab5fa
--- /dev/null
+++ b/src/f32-vbinary/gen/vmin-wasm-x8.c
@@ -0,0 +1,82 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// y[i] := min(a[i], b[i]) — elementwise minimum, WAsm scalar build unrolled 8x.
+// Same shape as the generic scalar kernel but uses __builtin_wasm_min_f32,
+// which lowers to the WebAssembly f32.min instruction.
+void xnn_f32_vmin_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = __builtin_wasm_min_f32(va0, vb0);
+    float vy1 = __builtin_wasm_min_f32(va1, vb1);
+    float vy2 = __builtin_wasm_min_f32(va2, vb2);
+    float vy3 = __builtin_wasm_min_f32(va3, vb3);
+    float vy4 = __builtin_wasm_min_f32(va4, vb4);
+    float vy5 = __builtin_wasm_min_f32(va5, vb5);
+    float vy6 = __builtin_wasm_min_f32(va6, vb6);
+    float vy7 = __builtin_wasm_min_f32(va7, vb7);
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Tail: 1-7 remaining elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = __builtin_wasm_min_f32(va, vb);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmin-wasmsimd-arm-x16.c b/src/f32-vbinary/gen/vmin-wasmsimd-arm-x16.c
new file mode 100644
index 0000000..6a03415
--- /dev/null
+++ b/src/f32-vbinary/gen/vmin-wasmsimd-arm-x16.c
@@ -0,0 +1,87 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// y[i] := min(a[i], b[i]) — elementwise minimum, ARM-tuned WAsm SIMD build:
+// uses the native f32x4.min instruction.  n is a byte count.
+void xnn_f32_vmin_ukernel__wasmsimd_arm_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 16 floats (4 vectors) from each input per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_min(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_min(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_min(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_min(vaCDEF, vbCDEF);
+
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Remainder: one 4-float vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_min(va, vb);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 floats: full-vector loads of both inputs over-read past the
+  // end — NOTE(review): assumes buffers tolerate the over-read; confirm.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_min(va, vb);
+
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmin-wasmsimd-x86-x16.c b/src/f32-vbinary/gen/vmin-wasmsimd-x86-x16.c
new file mode 100644
index 0000000..ff134d0
--- /dev/null
+++ b/src/f32-vbinary/gen/vmin-wasmsimd-x86-x16.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// y[i] := min(a[i], b[i]) — elementwise minimum, x86-tuned WAsm SIMD build.
+// Implements min as compare (a < b) + bitselect instead of f32x4.min.
+// NOTE(review): a NaN in `b` is selected (compare false) while a NaN in `a`
+// is not — NaN semantics differ from the ARM variant's f32x4.min.
+void xnn_f32_vmin_ukernel__wasmsimd_x86_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 16 floats (4 vectors) from each input per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    // Mask lanes where a < b; bitselect keeps a there, b elsewhere.
+    const v128_t vm0123 = wasm_f32x4_lt(va0123, vb0123);
+    const v128_t vm4567 = wasm_f32x4_lt(va4567, vb4567);
+    const v128_t vm89AB = wasm_f32x4_lt(va89AB, vb89AB);
+    const v128_t vmCDEF = wasm_f32x4_lt(vaCDEF, vbCDEF);
+
+    v128_t vy0123 = wasm_v128_bitselect(va0123, vb0123, vm0123);
+    v128_t vy4567 = wasm_v128_bitselect(va4567, vb4567, vm4567);
+    v128_t vy89AB = wasm_v128_bitselect(va89AB, vb89AB, vm89AB);
+    v128_t vyCDEF = wasm_v128_bitselect(vaCDEF, vbCDEF, vmCDEF);
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Remainder: one 4-float vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    const v128_t vm = wasm_f32x4_lt(va, vb);
+    v128_t vy = wasm_v128_bitselect(va, vb, vm);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 floats; full-vector loads over-read (assumes padded buffers).
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    const v128_t vm = wasm_f32x4_lt(va, vb);
+    v128_t vy = wasm_v128_bitselect(va, vb, vm);
+
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vminc-scalar-x8.c b/src/f32-vbinary/gen/vminc-scalar-x8.c
new file mode 100644
index 0000000..ae94dfb
--- /dev/null
+++ b/src/f32-vbinary/gen/vminc-scalar-x8.c
@@ -0,0 +1,72 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// y[i] := min(a[i], *b) — constant-operand minimum, portable scalar kernel
+// unrolled 8x.  b points at a single scalar read once before the loops.
+void xnn_f32_vminc_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = math_min_f32(va0, vb);
+    float vy1 = math_min_f32(va1, vb);
+    float vy2 = math_min_f32(va2, vb);
+    float vy3 = math_min_f32(va3, vb);
+    float vy4 = math_min_f32(va4, vb);
+    float vy5 = math_min_f32(va5, vb);
+    float vy6 = math_min_f32(va6, vb);
+    float vy7 = math_min_f32(va7, vb);
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Tail: 1-7 remaining elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = math_min_f32(va, vb);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vminc-wasm-x8.c b/src/f32-vbinary/gen/vminc-wasm-x8.c
new file mode 100644
index 0000000..e7c3aa3
--- /dev/null
+++ b/src/f32-vbinary/gen/vminc-wasm-x8.c
@@ -0,0 +1,72 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// y[i] := min(a[i], *b) — constant-operand minimum, WAsm scalar build
+// unrolled 8x; uses __builtin_wasm_min_f32 (WebAssembly f32.min).
+void xnn_f32_vminc_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = __builtin_wasm_min_f32(va0, vb);
+    float vy1 = __builtin_wasm_min_f32(va1, vb);
+    float vy2 = __builtin_wasm_min_f32(va2, vb);
+    float vy3 = __builtin_wasm_min_f32(va3, vb);
+    float vy4 = __builtin_wasm_min_f32(va4, vb);
+    float vy5 = __builtin_wasm_min_f32(va5, vb);
+    float vy6 = __builtin_wasm_min_f32(va6, vb);
+    float vy7 = __builtin_wasm_min_f32(va7, vb);
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Tail: 1-7 remaining elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = __builtin_wasm_min_f32(va, vb);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vminc-wasmsimd-arm-x16.c b/src/f32-vbinary/gen/vminc-wasmsimd-arm-x16.c
new file mode 100644
index 0000000..d2cc003
--- /dev/null
+++ b/src/f32-vbinary/gen/vminc-wasmsimd-arm-x16.c
@@ -0,0 +1,77 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// y[i] := min(a[i], *b) — constant-operand minimum, ARM-tuned WAsm SIMD
+// build: native f32x4.min against a splatted scalar.
+void xnn_f32_vminc_ukernel__wasmsimd_arm_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Broadcast the scalar operand across all 4 lanes once.
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 16 floats (4 vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_min(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_min(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_min(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_min(vaCDEF, vb);
+
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Remainder: one 4-float vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_min(va, vb);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 floats; full-vector load over-reads (assumes padded buffer).
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_min(va, vb);
+
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vminc-wasmsimd-x86-x16.c b/src/f32-vbinary/gen/vminc-wasmsimd-x86-x16.c
new file mode 100644
index 0000000..3310d73
--- /dev/null
+++ b/src/f32-vbinary/gen/vminc-wasmsimd-x86-x16.c
@@ -0,0 +1,83 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// y[i] := min(a[i], *b) — constant-operand minimum, x86-tuned WAsm SIMD
+// build: min as compare (a < b) + bitselect instead of f32x4.min.
+// NOTE(review): NaN handling differs from f32x4.min (see vmin x86 variant).
+void xnn_f32_vminc_ukernel__wasmsimd_x86_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Broadcast the scalar operand across all 4 lanes once.
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 16 floats (4 vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    // Mask lanes where a < b; bitselect keeps a there, b elsewhere.
+    const v128_t vm0123 = wasm_f32x4_lt(va0123, vb);
+    const v128_t vm4567 = wasm_f32x4_lt(va4567, vb);
+    const v128_t vm89AB = wasm_f32x4_lt(va89AB, vb);
+    const v128_t vmCDEF = wasm_f32x4_lt(vaCDEF, vb);
+
+    v128_t vy0123 = wasm_v128_bitselect(va0123, vb, vm0123);
+    v128_t vy4567 = wasm_v128_bitselect(va4567, vb, vm4567);
+    v128_t vy89AB = wasm_v128_bitselect(va89AB, vb, vm89AB);
+    v128_t vyCDEF = wasm_v128_bitselect(vaCDEF, vb, vmCDEF);
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Remainder: one 4-float vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vm = wasm_f32x4_lt(va, vb);
+    v128_t vy = wasm_v128_bitselect(va, vb, vm);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 floats; full-vector load over-reads (assumes padded buffer).
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    const v128_t vm = wasm_f32x4_lt(va, vb);
+    v128_t vy = wasm_v128_bitselect(va, vb, vm);
+
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-minmax-scalar-x8.c b/src/f32-vbinary/gen/vmul-minmax-scalar-x8.c
new file mode 100644
index 0000000..9c6c727
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-minmax-scalar-x8.c
@@ -0,0 +1,103 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// y[i] := clamp(a[i] * b[i], params->scalar.min, params->scalar.max) —
+// elementwise multiply with output clamping, portable scalar kernel
+// unrolled 8x.  n is a byte count, a multiple of sizeof(float).
+void xnn_f32_vmul_minmax_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Clamp bounds are loaded once from params.
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+    float vy2 = va2 * vb2;
+    float vy3 = va3 * vb3;
+    float vy4 = va4 * vb4;
+    float vy5 = va5 * vb5;
+    float vy6 = va6 * vb6;
+    float vy7 = va7 * vb7;
+
+
+    // Clamp: lower bound first, then upper bound.
+    vy0 = math_max_f32(vy0, vy_min);
+    vy1 = math_max_f32(vy1, vy_min);
+    vy2 = math_max_f32(vy2, vy_min);
+    vy3 = math_max_f32(vy3, vy_min);
+    vy4 = math_max_f32(vy4, vy_min);
+    vy5 = math_max_f32(vy5, vy_min);
+    vy6 = math_max_f32(vy6, vy_min);
+    vy7 = math_max_f32(vy7, vy_min);
+
+    vy0 = math_min_f32(vy0, vy_max);
+    vy1 = math_min_f32(vy1, vy_max);
+    vy2 = math_min_f32(vy2, vy_max);
+    vy3 = math_min_f32(vy3, vy_max);
+    vy4 = math_min_f32(vy4, vy_max);
+    vy5 = math_min_f32(vy5, vy_max);
+    vy6 = math_min_f32(vy6, vy_max);
+    vy7 = math_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Tail: 1-7 remaining elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va * vb;
+      vy = math_max_f32(vy, vy_min);
+      vy = math_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-minmax-wasm-x8.c b/src/f32-vbinary/gen/vmul-minmax-wasm-x8.c
new file mode 100644
index 0000000..05ffa24
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-minmax-wasm-x8.c
@@ -0,0 +1,103 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// y[i] := clamp(a[i] * b[i], params->scalar.min, params->scalar.max) —
+// multiply with output clamping, WAsm scalar build unrolled 8x; clamps use
+// __builtin_wasm_max_f32/__builtin_wasm_min_f32 (WAsm f32.max/f32.min).
+void xnn_f32_vmul_minmax_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Clamp bounds are loaded once from params.
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+    float vy2 = va2 * vb2;
+    float vy3 = va3 * vb3;
+    float vy4 = va4 * vb4;
+    float vy5 = va5 * vb5;
+    float vy6 = va6 * vb6;
+    float vy7 = va7 * vb7;
+
+
+    // Clamp: lower bound first, then upper bound.
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+    vy4 = __builtin_wasm_max_f32(vy4, vy_min);
+    vy5 = __builtin_wasm_max_f32(vy5, vy_min);
+    vy6 = __builtin_wasm_max_f32(vy6, vy_min);
+    vy7 = __builtin_wasm_max_f32(vy7, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+    vy4 = __builtin_wasm_min_f32(vy4, vy_max);
+    vy5 = __builtin_wasm_min_f32(vy5, vy_max);
+    vy6 = __builtin_wasm_min_f32(vy6, vy_max);
+    vy7 = __builtin_wasm_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Tail: 1-7 remaining elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va * vb;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-minmax-wasmsimd-arm-x16.c b/src/f32-vbinary/gen/vmul-minmax-wasmsimd-arm-x16.c
new file mode 100644
index 0000000..c4d34d4
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-minmax-wasmsimd-arm-x16.c
@@ -0,0 +1,102 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// y[i] := clamp(a[i] * b[i], params->scalar.min, params->scalar.max) —
+// multiply with output clamping, ARM-tuned WAsm SIMD build: native
+// f32x4.max / f32x4.min for the clamp.  n is a byte count.
+void xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Splat the clamp bounds across all lanes once, outside the loops.
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+  // Main loop: 16 floats (4 vectors) from each input per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_mul(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_mul(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_mul(vaCDEF, vbCDEF);
+
+
+    // Clamp: lower bound first, then upper bound.
+    vy0123 = wasm_f32x4_max(vy0123, vy_min);
+    vy4567 = wasm_f32x4_max(vy4567, vy_min);
+    vy89AB = wasm_f32x4_max(vy89AB, vy_min);
+    vyCDEF = wasm_f32x4_max(vyCDEF, vy_min);
+
+    vy0123 = wasm_f32x4_min(vy0123, vy_max);
+    vy4567 = wasm_f32x4_min(vy4567, vy_max);
+    vy89AB = wasm_f32x4_min(vy89AB, vy_max);
+    vyCDEF = wasm_f32x4_min(vyCDEF, vy_max);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Remainder: one 4-float vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 floats: full-vector loads of both inputs over-read past the
+  // end — NOTE(review): assumes buffers tolerate the over-read; confirm.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-minmax-wasmsimd-x86-x16.c b/src/f32-vbinary/gen/vmul-minmax-wasmsimd-x86-x16.c
new file mode 100644
index 0000000..385a0d8
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-minmax-wasmsimd-x86-x16.c
@@ -0,0 +1,115 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Elementwise f32 multiply with output clamping, WAsm SIMD, x86-tuned,
+// 16 elements per main-loop iteration:
+//   y[i] = min(max(a[i] * b[i], params->scalar.min), params->scalar.max)
+//
+//   n      - size of each array in BYTES; non-zero multiple of sizeof(float)
+//   a, b   - input vectors; y - output vector
+//   params - scalar.min / scalar.max clamping bounds
+void xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Broadcast the clamping bounds to all four lanes.
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+  // Main loop: 16 floats (four 128-bit vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_mul(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_mul(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_mul(vaCDEF, vbCDEF);
+
+
+    // NOTE(review): this x86-tuned variant clamps via compare + bitselect
+    // instead of wasm_f32x4_min/max (the arm variant uses those directly) —
+    // presumably because f32x4_min/max lower to slow NaN-correct sequences
+    // on x86; confirm against the vop-wasmsimd.c.in generator template.
+    const v128_t vltmask0123 = wasm_f32x4_lt(vy0123, vy_min);
+    const v128_t vltmask4567 = wasm_f32x4_lt(vy4567, vy_min);
+    const v128_t vltmask89AB = wasm_f32x4_lt(vy89AB, vy_min);
+    const v128_t vltmaskCDEF = wasm_f32x4_lt(vyCDEF, vy_min);
+
+    const v128_t vngtmask0123 = wasm_f32x4_le(vy0123, vy_max);
+    vy0123 = wasm_v128_bitselect(vy_min, vy0123, vltmask0123);
+    const v128_t vngtmask4567 = wasm_f32x4_le(vy4567, vy_max);
+    vy4567 = wasm_v128_bitselect(vy_min, vy4567, vltmask4567);
+    const v128_t vngtmask89AB = wasm_f32x4_le(vy89AB, vy_max);
+    vy89AB = wasm_v128_bitselect(vy_min, vy89AB, vltmask89AB);
+    const v128_t vngtmaskCDEF = wasm_f32x4_le(vyCDEF, vy_max);
+    vyCDEF = wasm_v128_bitselect(vy_min, vyCDEF, vltmaskCDEF);
+
+    vy0123 = wasm_v128_bitselect(vy0123, vy_max, vngtmask0123);
+    vy4567 = wasm_v128_bitselect(vy4567, vy_max, vngtmask4567);
+    vy89AB = wasm_v128_bitselect(vy89AB, vy_max, vngtmask89AB);
+    vyCDEF = wasm_v128_bitselect(vyCDEF, vy_max, vngtmaskCDEF);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Tail loop: one 4-float vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Final 1-3 elements: compute a full vector, store only the valid lanes.
+  if XNN_UNLIKELY(n != 0) {
+    // NOTE(review): full 16-byte loads with fewer than 4 floats left read
+    // past the logical end of a/b — assumes XNNPACK's usual over-read
+    // allowance on input buffers; confirm the padding contract.
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    if (n & (2 * sizeof(float))) {
+      // Store the low two lanes with a single type-punned 8-byte write,
+      // then shift the upper lanes down for the possible odd element.
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-scalar-x8.c b/src/f32-vbinary/gen/vmul-relu-scalar-x8.c
new file mode 100644
index 0000000..c2caac8
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-scalar-x8.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Fused f32 multiply + ReLU, portable scalar, unrolled x8:
+//   y[i] = max(a[i] * b[i], 0.0f)
+//
+//   n      - size of each array in BYTES; non-zero multiple of sizeof(float)
+//   a, b   - input vectors; y - output vector
+//   params - unused (ReLU carries no runtime parameters)
+void xnn_f32_vmul_relu_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 8 elements per iteration, unrolled to expose ILP.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+    float vy2 = va2 * vb2;
+    float vy3 = va3 * vb3;
+    float vy4 = va4 * vb4;
+    float vy5 = va5 * vb5;
+    float vy6 = va6 * vb6;
+    float vy7 = va7 * vb7;
+
+
+    // ReLU: clamp negative products to zero.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+    vy4 = math_max_f32(vy4, 0.0f);
+    vy5 = math_max_f32(vy5, 0.0f);
+    vy6 = math_max_f32(vy6, 0.0f);
+    vy7 = math_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: final 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va * vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-wasm-x8.c b/src/f32-vbinary/gen/vmul-relu-wasm-x8.c
new file mode 100644
index 0000000..6d3943e
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-wasm-x8.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Fused f32 multiply + ReLU, WebAssembly (non-SIMD) scalar, unrolled x8:
+//   y[i] = max(a[i] * b[i], 0.0f)
+// Identical to the generic scalar x8 variant except that the clamp uses
+// __builtin_wasm_max_f32, which lowers to the single WAsm f32.max opcode.
+//
+//   n      - size of each array in BYTES; non-zero multiple of sizeof(float)
+//   a, b   - input vectors; y - output vector
+//   params - unused (ReLU carries no runtime parameters)
+void xnn_f32_vmul_relu_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 8 elements per iteration, unrolled to expose ILP.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+    float vy2 = va2 * vb2;
+    float vy3 = va3 * vb3;
+    float vy4 = va4 * vb4;
+    float vy5 = va5 * vb5;
+    float vy6 = va6 * vb6;
+    float vy7 = va7 * vb7;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+    vy4 = __builtin_wasm_max_f32(vy4, 0.0f);
+    vy5 = __builtin_wasm_max_f32(vy5, 0.0f);
+    vy6 = __builtin_wasm_max_f32(vy6, 0.0f);
+    vy7 = __builtin_wasm_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: final 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va * vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-relu-wasmsimd-x16.c b/src/f32-vbinary/gen/vmul-relu-wasmsimd-x16.c
new file mode 100644
index 0000000..75f2021
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-relu-wasmsimd-x16.c
@@ -0,0 +1,94 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Fused f32 multiply + ReLU, WAsm SIMD, 16 elements per main-loop iteration:
+//   y[i] = max(a[i] * b[i], 0.0f)
+//
+//   n      - size of each array in BYTES; non-zero multiple of sizeof(float)
+//   a, b   - input vectors; y - output vector
+//   params - unused (ReLU carries no runtime parameters)
+void xnn_f32_vmul_relu_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+
+  // Main loop: 16 floats (four 128-bit vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_mul(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_mul(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_mul(vaCDEF, vbCDEF);
+
+
+    // ReLU via SIGNED INTEGER max with +0.0f (all-zero bit pattern): any
+    // float with the sign bit set compares as a negative i32 and is replaced
+    // by zero, which also canonicalizes -0.0f to +0.0f.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+    vy89AB = wasm_i32x4_max(vy89AB, vzero);
+    vyCDEF = wasm_i32x4_max(vyCDEF, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Tail loop: one 4-float vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Final 1-3 elements: compute a full vector, store only the valid lanes.
+  // NOTE(review): full 16-byte loads here read past the logical end of a/b —
+  // assumes XNNPACK's usual over-read allowance; confirm the padding contract.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    if (n & (2 * sizeof(float))) {
+      // Store the low two lanes with one type-punned 8-byte write, then
+      // shift the upper lanes down for the possible odd element.
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-scalar-x8.c b/src/f32-vbinary/gen/vmul-scalar-x8.c
new file mode 100644
index 0000000..e28fec4
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-scalar-x8.c
@@ -0,0 +1,82 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Elementwise f32 multiply, portable scalar, unrolled x8:
+//   y[i] = a[i] * b[i]
+//
+//   n      - size of each array in BYTES; non-zero multiple of sizeof(float)
+//   a, b   - input vectors; y - output vector
+//   params - unused (default params carry no state)
+void xnn_f32_vmul_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 8 elements per iteration, unrolled to expose ILP.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+    float vy2 = va2 * vb2;
+    float vy3 = va3 * vb3;
+    float vy4 = va4 * vb4;
+    float vy5 = va5 * vb5;
+    float vy6 = va6 * vb6;
+    float vy7 = va7 * vb7;
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: final 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va * vb;
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmul-wasmsimd-x16.c b/src/f32-vbinary/gen/vmul-wasmsimd-x16.c
new file mode 100644
index 0000000..e6daa7e
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-wasmsimd-x16.c
@@ -0,0 +1,87 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Elementwise f32 multiply, WAsm SIMD, 16 elements per main-loop iteration:
+//   y[i] = a[i] * b[i]
+//
+//   n      - size of each array in BYTES; non-zero multiple of sizeof(float)
+//   a, b   - input vectors; y - output vector
+//   params - unused (default params carry no state)
+void xnn_f32_vmul_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 16 floats (four 128-bit vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_mul(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_mul(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_mul(vaCDEF, vbCDEF);
+
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Tail loop: one 4-float vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Final 1-3 elements: compute a full vector, store only the valid lanes.
+  // NOTE(review): full 16-byte loads here read past the logical end of a/b —
+  // assumes XNNPACK's usual over-read allowance; confirm the padding contract.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+
+    if (n & (2 * sizeof(float))) {
+      // Store the low two lanes with one type-punned 8-byte write, then
+      // shift the upper lanes down for the possible odd element.
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-minmax-scalar-x8.c b/src/f32-vbinary/gen/vmulc-minmax-scalar-x8.c
new file mode 100644
index 0000000..65bbd9b
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-minmax-scalar-x8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// f32 multiply-by-constant with output clamping, portable scalar, unrolled x8:
+//   y[i] = min(max(a[i] * b[0], params->scalar.min), params->scalar.max)
+//
+//   n      - size of a/y in BYTES; non-zero multiple of sizeof(float)
+//   a      - input vector; b - pointer to the single scalar operand
+//   y      - output vector
+//   params - scalar.min / scalar.max clamping bounds
+void xnn_f32_vmulc_minmax_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  // b supplies one scalar, read once and reused for every element.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration, unrolled to expose ILP.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 * vb;
+    float vy1 = va1 * vb;
+    float vy2 = va2 * vb;
+    float vy3 = va3 * vb;
+    float vy4 = va4 * vb;
+    float vy5 = va5 * vb;
+    float vy6 = va6 * vb;
+    float vy7 = va7 * vb;
+
+
+    // Clamp to [vy_min, vy_max].
+    vy0 = math_max_f32(vy0, vy_min);
+    vy1 = math_max_f32(vy1, vy_min);
+    vy2 = math_max_f32(vy2, vy_min);
+    vy3 = math_max_f32(vy3, vy_min);
+    vy4 = math_max_f32(vy4, vy_min);
+    vy5 = math_max_f32(vy5, vy_min);
+    vy6 = math_max_f32(vy6, vy_min);
+    vy7 = math_max_f32(vy7, vy_min);
+
+    vy0 = math_min_f32(vy0, vy_max);
+    vy1 = math_min_f32(vy1, vy_max);
+    vy2 = math_min_f32(vy2, vy_max);
+    vy3 = math_min_f32(vy3, vy_max);
+    vy4 = math_min_f32(vy4, vy_max);
+    vy5 = math_min_f32(vy5, vy_max);
+    vy6 = math_min_f32(vy6, vy_max);
+    vy7 = math_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: final 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va * vb;
+      vy = math_max_f32(vy, vy_min);
+      vy = math_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-minmax-wasm-x8.c b/src/f32-vbinary/gen/vmulc-minmax-wasm-x8.c
new file mode 100644
index 0000000..110a941
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-minmax-wasm-x8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// f32 multiply-by-constant with output clamping, WebAssembly (non-SIMD)
+// scalar, unrolled x8:
+//   y[i] = min(max(a[i] * b[0], params->scalar.min), params->scalar.max)
+// Identical to the generic scalar x8 variant except the clamps use the
+// __builtin_wasm_min/max_f32 builtins (single WAsm f32.min/f32.max opcodes).
+//
+//   n      - size of a/y in BYTES; non-zero multiple of sizeof(float)
+//   a      - input vector; b - pointer to the single scalar operand
+//   y      - output vector
+//   params - scalar.min / scalar.max clamping bounds
+void xnn_f32_vmulc_minmax_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  // b supplies one scalar, read once and reused for every element.
+  const float vb = *b;
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 * vb;
+    float vy1 = va1 * vb;
+    float vy2 = va2 * vb;
+    float vy3 = va3 * vb;
+    float vy4 = va4 * vb;
+    float vy5 = va5 * vb;
+    float vy6 = va6 * vb;
+    float vy7 = va7 * vb;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+    vy4 = __builtin_wasm_max_f32(vy4, vy_min);
+    vy5 = __builtin_wasm_max_f32(vy5, vy_min);
+    vy6 = __builtin_wasm_max_f32(vy6, vy_min);
+    vy7 = __builtin_wasm_max_f32(vy7, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+    vy4 = __builtin_wasm_min_f32(vy4, vy_max);
+    vy5 = __builtin_wasm_min_f32(vy5, vy_max);
+    vy6 = __builtin_wasm_min_f32(vy6, vy_max);
+    vy7 = __builtin_wasm_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: final 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va * vb;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x16.c b/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x16.c
new file mode 100644
index 0000000..0366f00
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x16.c
@@ -0,0 +1,92 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// f32 multiply-by-constant with output clamping, WAsm SIMD, ARM-tuned,
+// 16 elements per main-loop iteration:
+//   y[i] = min(max(a[i] * b[0], params->scalar.min), params->scalar.max)
+// Uses wasm_f32x4_min/max directly — presumably cheap on ARM targets, unlike
+// the x86-tuned variant which clamps via compare + bitselect; confirm
+// against the vopc-wasmsimd.c.in generator template.
+//
+//   n      - size of a/y in BYTES; non-zero multiple of sizeof(float)
+//   a      - input vector; b - pointer to the single scalar operand
+//   y      - output vector
+//   params - scalar.min / scalar.max clamping bounds
+void xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Broadcast the clamping bounds and the scalar operand to all four lanes.
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_mul(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_mul(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_mul(vaCDEF, vb);
+
+
+    vy0123 = wasm_f32x4_max(vy0123, vy_min);
+    vy4567 = wasm_f32x4_max(vy4567, vy_min);
+    vy89AB = wasm_f32x4_max(vy89AB, vy_min);
+    vyCDEF = wasm_f32x4_max(vyCDEF, vy_min);
+
+    vy0123 = wasm_f32x4_min(vy0123, vy_max);
+    vy4567 = wasm_f32x4_min(vy4567, vy_max);
+    vy89AB = wasm_f32x4_min(vy89AB, vy_max);
+    vyCDEF = wasm_f32x4_min(vyCDEF, vy_max);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Tail loop: one 4-float vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Final 1-3 elements: compute a full vector, store only the valid lanes.
+  // NOTE(review): the full 16-byte load reads past the logical end of a —
+  // assumes XNNPACK's usual over-read allowance; confirm the padding contract.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    if (n & (2 * sizeof(float))) {
+      // Store the low two lanes with one type-punned 8-byte write, then
+      // shift the upper lanes down for the possible odd element.
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x16.c b/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x16.c
new file mode 100644
index 0000000..ff7f5b2
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x16.c
@@ -0,0 +1,105 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// f32 multiply-by-constant with output clamping, WAsm SIMD, x86-tuned,
+// 16 elements per main-loop iteration:
+//   y[i] = min(max(a[i] * b[0], params->scalar.min), params->scalar.max)
+//
+//   n      - size of a/y in BYTES; non-zero multiple of sizeof(float)
+//   a      - input vector; b - pointer to the single scalar operand
+//   y      - output vector
+//   params - scalar.min / scalar.max clamping bounds
+void xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Broadcast the clamping bounds and the scalar operand to all four lanes.
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_mul(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_mul(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_mul(vaCDEF, vb);
+
+
+    // NOTE(review): this x86-tuned variant clamps via compare + bitselect
+    // instead of wasm_f32x4_min/max (the arm variant uses those directly) —
+    // presumably because f32x4_min/max lower to slow NaN-correct sequences
+    // on x86; confirm against the vopc-wasmsimd.c.in generator template.
+    const v128_t vltmask0123 = wasm_f32x4_lt(vy0123, vy_min);
+    const v128_t vltmask4567 = wasm_f32x4_lt(vy4567, vy_min);
+    const v128_t vltmask89AB = wasm_f32x4_lt(vy89AB, vy_min);
+    const v128_t vltmaskCDEF = wasm_f32x4_lt(vyCDEF, vy_min);
+
+    const v128_t vngtmask0123 = wasm_f32x4_le(vy0123, vy_max);
+    vy0123 = wasm_v128_bitselect(vy_min, vy0123, vltmask0123);
+    const v128_t vngtmask4567 = wasm_f32x4_le(vy4567, vy_max);
+    vy4567 = wasm_v128_bitselect(vy_min, vy4567, vltmask4567);
+    const v128_t vngtmask89AB = wasm_f32x4_le(vy89AB, vy_max);
+    vy89AB = wasm_v128_bitselect(vy_min, vy89AB, vltmask89AB);
+    const v128_t vngtmaskCDEF = wasm_f32x4_le(vyCDEF, vy_max);
+    vyCDEF = wasm_v128_bitselect(vy_min, vyCDEF, vltmaskCDEF);
+
+    vy0123 = wasm_v128_bitselect(vy0123, vy_max, vngtmask0123);
+    vy4567 = wasm_v128_bitselect(vy4567, vy_max, vngtmask4567);
+    vy89AB = wasm_v128_bitselect(vy89AB, vy_max, vngtmask89AB);
+    vyCDEF = wasm_v128_bitselect(vyCDEF, vy_max, vngtmaskCDEF);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Tail loop: one 4-float vector at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Final 1-3 elements: compute a full vector, store only the valid lanes.
+  // NOTE(review): the full 16-byte load reads past the logical end of a —
+  // assumes XNNPACK's usual over-read allowance; confirm the padding contract.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    if (n & (2 * sizeof(float))) {
+      // Store the low two lanes with one type-punned 8-byte write, then
+      // shift the upper lanes down for the possible odd element.
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-scalar-x8.c b/src/f32-vbinary/gen/vmulc-relu-scalar-x8.c
new file mode 100644
index 0000000..117c60e
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-scalar-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Fused f32 multiply-by-constant + ReLU, portable scalar, unrolled x8:
+//   y[i] = max(a[i] * b[0], 0.0f)
+//
+//   n      - size of a/y in BYTES; non-zero multiple of sizeof(float)
+//   a      - input vector; b - pointer to the single scalar operand
+//   y      - output vector
+//   params - unused (ReLU carries no runtime parameters)
+void xnn_f32_vmulc_relu_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // b supplies one scalar, read once and reused for every element.
+  const float vb = *b;
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 * vb;
+    float vy1 = va1 * vb;
+    float vy2 = va2 * vb;
+    float vy3 = va3 * vb;
+    float vy4 = va4 * vb;
+    float vy5 = va5 * vb;
+    float vy6 = va6 * vb;
+    float vy7 = va7 * vb;
+
+
+    // ReLU: clamp negative products to zero.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+    vy4 = math_max_f32(vy4, 0.0f);
+    vy5 = math_max_f32(vy5, 0.0f);
+    vy6 = math_max_f32(vy6, 0.0f);
+    vy7 = math_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: final 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va * vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-wasm-x8.c b/src/f32-vbinary/gen/vmulc-relu-wasm-x8.c
new file mode 100644
index 0000000..37af2a5
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-wasm-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Fused f32 multiply-by-constant + ReLU, WebAssembly (non-SIMD) scalar,
+// unrolled x8:
+//   y[i] = max(a[i] * b[0], 0.0f)
+// Identical to the generic scalar x8 variant except that the clamp uses
+// __builtin_wasm_max_f32, which lowers to the single WAsm f32.max opcode.
+//
+//   n      - size of a/y in BYTES; non-zero multiple of sizeof(float)
+//   a      - input vector; b - pointer to the single scalar operand
+//   y      - output vector
+//   params - unused (ReLU carries no runtime parameters)
+void xnn_f32_vmulc_relu_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // b supplies one scalar, read once and reused for every element.
+  const float vb = *b;
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 * vb;
+    float vy1 = va1 * vb;
+    float vy2 = va2 * vb;
+    float vy3 = va3 * vb;
+    float vy4 = va4 * vb;
+    float vy5 = va5 * vb;
+    float vy6 = va6 * vb;
+    float vy7 = va7 * vb;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+    vy4 = __builtin_wasm_max_f32(vy4, 0.0f);
+    vy5 = __builtin_wasm_max_f32(vy5, 0.0f);
+    vy6 = __builtin_wasm_max_f32(vy6, 0.0f);
+    vy7 = __builtin_wasm_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: final 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va * vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x16.c b/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x16.c
new file mode 100644
index 0000000..4c11cc4
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-relu-wasmsimd-x16.c
@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Computes y[i] = max(a[i] * (*b), 0.0f) using WAsm SIMD128, 16 elements
+// (4 vectors) per main-loop iteration, then 4-wide, then a partial tail.
+// n is the byte length of a/y; b points to one scalar splatted across lanes.
+void xnn_f32_vmulc_relu_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_mul(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_mul(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_mul(vaCDEF, vb);
+
+
+    // ReLU via signed-integer max with +0.0f: negative floats have the sign
+    // bit set and thus compare below 0 as signed ints, so i32x4.max zeroes
+    // exactly the negative lanes (and -0.0f becomes +0.0f).
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+    vy89AB = wasm_i32x4_max(vy89AB, vzero);
+    vyCDEF = wasm_i32x4_max(vyCDEF, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // NOTE(review): loads a full 16-byte vector although n < 4 floats remain;
+    // assumes reading past the end of a is permitted (XNNPACK padding
+    // convention) — confirm against the buffer allocation guarantees.
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    // Partial store: 2 low floats via a 64-bit lane, then 1 more float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-scalar-x8.c b/src/f32-vbinary/gen/vmulc-scalar-x8.c
new file mode 100644
index 0000000..e78a29a
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-scalar-x8.c
@@ -0,0 +1,72 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Computes y[i] = a[i] * (*b) for n/sizeof(float) elements (no activation).
+// Portable scalar microkernel, unrolled to 8 elements per iteration.
+// n is the byte length of a/y; b points to a single broadcast scalar;
+// params is not referenced.
+void xnn_f32_vmulc_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 * vb;
+    float vy1 = va1 * vb;
+    float vy2 = va2 * vb;
+    float vy3 = va3 * vb;
+    float vy4 = va4 * vb;
+    float vy5 = va5 * vb;
+    float vy6 = va6 * vb;
+    float vy7 = va7 * vb;
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: final 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va * vb;
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vmulc-wasmsimd-x16.c b/src/f32-vbinary/gen/vmulc-wasmsimd-x16.c
new file mode 100644
index 0000000..e1dfe06
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-wasmsimd-x16.c
@@ -0,0 +1,77 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Computes y[i] = a[i] * (*b) using WAsm SIMD128 (no activation).
+// Main loop handles 16 elements (4 vectors), then 4-wide, then partial tail.
+// n is the byte length of a/y; b points to one scalar splatted across lanes.
+void xnn_f32_vmulc_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_mul(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_mul(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_mul(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_mul(vaCDEF, vb);
+
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // NOTE(review): full-vector load with n < 4 floats remaining — assumes
+    // over-reading a is permitted (XNNPACK padding convention); confirm.
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_mul(va, vb);
+
+
+    // Partial store: 2 low floats via a 64-bit lane, then 1 more float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-minmax-scalar-x8.c b/src/f32-vbinary/gen/vrdivc-minmax-scalar-x8.c
new file mode 100644
index 0000000..bcf5eb2
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-minmax-scalar-x8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Computes y[i] = clamp((*b) / a[i], params->scalar.min, params->scalar.max).
+// "r" (reversed) divide-by-constant: the broadcast scalar is the NUMERATOR.
+// Portable scalar microkernel, unrolled to 8 elements per iteration;
+// n is the byte length of a/y.
+void xnn_f32_vrdivc_minmax_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  const float vb = *b;
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = vb / va0;
+    float vy1 = vb / va1;
+    float vy2 = vb / va2;
+    float vy3 = vb / va3;
+    float vy4 = vb / va4;
+    float vy5 = vb / va5;
+    float vy6 = vb / va6;
+    float vy7 = vb / va7;
+
+
+    // Clamp below, then above.
+    vy0 = math_max_f32(vy0, vy_min);
+    vy1 = math_max_f32(vy1, vy_min);
+    vy2 = math_max_f32(vy2, vy_min);
+    vy3 = math_max_f32(vy3, vy_min);
+    vy4 = math_max_f32(vy4, vy_min);
+    vy5 = math_max_f32(vy5, vy_min);
+    vy6 = math_max_f32(vy6, vy_min);
+    vy7 = math_max_f32(vy7, vy_min);
+
+    vy0 = math_min_f32(vy0, vy_max);
+    vy1 = math_min_f32(vy1, vy_max);
+    vy2 = math_min_f32(vy2, vy_max);
+    vy3 = math_min_f32(vy3, vy_max);
+    vy4 = math_min_f32(vy4, vy_max);
+    vy5 = math_min_f32(vy5, vy_max);
+    vy6 = math_min_f32(vy6, vy_max);
+    vy7 = math_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: final 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = vb / va;
+      vy = math_max_f32(vy, vy_min);
+      vy = math_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-minmax-wasm-x8.c b/src/f32-vbinary/gen/vrdivc-minmax-wasm-x8.c
new file mode 100644
index 0000000..51cb162
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-minmax-wasm-x8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Computes y[i] = clamp((*b) / a[i], params->scalar.min, params->scalar.max).
+// Reversed divide-by-constant: the broadcast scalar is the NUMERATOR.
+// WAsm scalar variant of the scalar_x8 kernel: identical structure, but the
+// clamp uses the single-instruction __builtin_wasm_min/max_f32 intrinsics.
+void xnn_f32_vrdivc_minmax_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  const float vb = *b;
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = vb / va0;
+    float vy1 = vb / va1;
+    float vy2 = vb / va2;
+    float vy3 = vb / va3;
+    float vy4 = vb / va4;
+    float vy5 = vb / va5;
+    float vy6 = vb / va6;
+    float vy7 = vb / va7;
+
+
+    // Clamp below, then above.
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+    vy4 = __builtin_wasm_max_f32(vy4, vy_min);
+    vy5 = __builtin_wasm_max_f32(vy5, vy_min);
+    vy6 = __builtin_wasm_max_f32(vy6, vy_min);
+    vy7 = __builtin_wasm_max_f32(vy7, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+    vy4 = __builtin_wasm_min_f32(vy4, vy_max);
+    vy5 = __builtin_wasm_min_f32(vy5, vy_max);
+    vy6 = __builtin_wasm_min_f32(vy6, vy_max);
+    vy7 = __builtin_wasm_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: final 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = vb / va;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x16.c b/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x16.c
new file mode 100644
index 0000000..c5464ae
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x16.c
@@ -0,0 +1,92 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Computes y[i] = clamp((*b) / a[i], min, max) with WAsm SIMD128; the
+// broadcast scalar is the NUMERATOR. "arm" variant: uses the native
+// f32x4.min/max instructions directly (contrast with the x86 variant,
+// which clamps via compare + bitselect).
+void xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_div(vb, va0123);
+    v128_t vy4567 = wasm_f32x4_div(vb, va4567);
+    v128_t vy89AB = wasm_f32x4_div(vb, va89AB);
+    v128_t vyCDEF = wasm_f32x4_div(vb, vaCDEF);
+
+
+    // Clamp below, then above.
+    vy0123 = wasm_f32x4_max(vy0123, vy_min);
+    vy4567 = wasm_f32x4_max(vy4567, vy_min);
+    vy89AB = wasm_f32x4_max(vy89AB, vy_min);
+    vyCDEF = wasm_f32x4_max(vyCDEF, vy_min);
+
+    vy0123 = wasm_f32x4_min(vy0123, vy_max);
+    vy4567 = wasm_f32x4_min(vy4567, vy_max);
+    vy89AB = wasm_f32x4_min(vy89AB, vy_max);
+    vyCDEF = wasm_f32x4_min(vyCDEF, vy_max);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_div(vb, va);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // NOTE(review): full-vector load with n < 4 floats remaining — assumes
+    // over-reading a is permitted (XNNPACK padding convention); confirm.
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_div(vb, va);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    // Partial store: 2 low floats via a 64-bit lane, then 1 more float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x16.c b/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x16.c
new file mode 100644
index 0000000..69d6ee2
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x16.c
@@ -0,0 +1,105 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Computes y[i] = clamp((*b) / a[i], min, max) with WAsm SIMD128; the
+// broadcast scalar is the NUMERATOR. "x86" variant: clamps via explicit
+// f32x4 compares + v128.bitselect instead of f32x4.min/max.
+// NOTE(review): presumably because min/max lowers to a slow instruction
+// sequence on x86 engines — confirm against the generator template.
+void xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_div(vb, va0123);
+    v128_t vy4567 = wasm_f32x4_div(vb, va4567);
+    v128_t vy89AB = wasm_f32x4_div(vb, va89AB);
+    v128_t vyCDEF = wasm_f32x4_div(vb, vaCDEF);
+
+
+    // Lower clamp: lanes where vy < min are replaced by min (bitselect picks
+    // its first operand where the mask is all-ones).
+    const v128_t vltmask0123 = wasm_f32x4_lt(vy0123, vy_min);
+    const v128_t vltmask4567 = wasm_f32x4_lt(vy4567, vy_min);
+    const v128_t vltmask89AB = wasm_f32x4_lt(vy89AB, vy_min);
+    const v128_t vltmaskCDEF = wasm_f32x4_lt(vyCDEF, vy_min);
+
+    // Upper-clamp masks are computed BEFORE the lower-clamp bitselects
+    // overwrite vy (mask interleaving also helps scheduling).
+    const v128_t vngtmask0123 = wasm_f32x4_le(vy0123, vy_max);
+    vy0123 = wasm_v128_bitselect(vy_min, vy0123, vltmask0123);
+    const v128_t vngtmask4567 = wasm_f32x4_le(vy4567, vy_max);
+    vy4567 = wasm_v128_bitselect(vy_min, vy4567, vltmask4567);
+    const v128_t vngtmask89AB = wasm_f32x4_le(vy89AB, vy_max);
+    vy89AB = wasm_v128_bitselect(vy_min, vy89AB, vltmask89AB);
+    const v128_t vngtmaskCDEF = wasm_f32x4_le(vyCDEF, vy_max);
+    vyCDEF = wasm_v128_bitselect(vy_min, vyCDEF, vltmaskCDEF);
+
+    // Upper clamp: lanes where vy > max (mask false) are replaced by max.
+    vy0123 = wasm_v128_bitselect(vy0123, vy_max, vngtmask0123);
+    vy4567 = wasm_v128_bitselect(vy4567, vy_max, vngtmask4567);
+    vy89AB = wasm_v128_bitselect(vy89AB, vy_max, vngtmask89AB);
+    vyCDEF = wasm_v128_bitselect(vyCDEF, vy_max, vngtmaskCDEF);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_div(vb, va);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // NOTE(review): full-vector load with n < 4 floats remaining — assumes
+    // over-reading a is permitted (XNNPACK padding convention); confirm.
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_div(vb, va);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    // Partial store: 2 low floats via a 64-bit lane, then 1 more float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-scalar-x8.c b/src/f32-vbinary/gen/vrdivc-relu-scalar-x8.c
new file mode 100644
index 0000000..d1da94d
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-scalar-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Computes y[i] = max((*b) / a[i], 0.0f); the broadcast scalar is the
+// NUMERATOR ("r" = reversed operands). Portable scalar microkernel,
+// unrolled to 8 elements per iteration. n is the byte length of a/y;
+// params is not referenced.
+void xnn_f32_vrdivc_relu_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = vb / va0;
+    float vy1 = vb / va1;
+    float vy2 = vb / va2;
+    float vy3 = vb / va3;
+    float vy4 = vb / va4;
+    float vy5 = vb / va5;
+    float vy6 = vb / va6;
+    float vy7 = vb / va7;
+
+
+    // ReLU: clamp negative quotients to zero.
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+    vy4 = math_max_f32(vy4, 0.0f);
+    vy5 = math_max_f32(vy5, 0.0f);
+    vy6 = math_max_f32(vy6, 0.0f);
+    vy7 = math_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: final 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = vb / va;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-wasm-x8.c b/src/f32-vbinary/gen/vrdivc-relu-wasm-x8.c
new file mode 100644
index 0000000..bb4983e
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-wasm-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Computes y[i] = max((*b) / a[i], 0.0f); the broadcast scalar is the
+// NUMERATOR. WAsm scalar variant of the scalar_x8 kernel, using the
+// single-instruction __builtin_wasm_max_f32 for the ReLU clamp.
+void xnn_f32_vrdivc_relu_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = vb / va0;
+    float vy1 = vb / va1;
+    float vy2 = vb / va2;
+    float vy3 = vb / va3;
+    float vy4 = vb / va4;
+    float vy5 = vb / va5;
+    float vy6 = vb / va6;
+    float vy7 = vb / va7;
+
+
+    // ReLU: clamp negative quotients to zero.
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+    vy4 = __builtin_wasm_max_f32(vy4, 0.0f);
+    vy5 = __builtin_wasm_max_f32(vy5, 0.0f);
+    vy6 = __builtin_wasm_max_f32(vy6, 0.0f);
+    vy7 = __builtin_wasm_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: final 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = vb / va;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x16.c b/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x16.c
new file mode 100644
index 0000000..fdb3e68
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-relu-wasmsimd-x16.c
@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Computes y[i] = max((*b) / a[i], 0.0f) with WAsm SIMD128; the broadcast
+// scalar is the NUMERATOR. Main loop handles 16 elements (4 vectors),
+// then 4-wide, then a partial tail.
+void xnn_f32_vrdivc_relu_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_div(vb, va0123);
+    v128_t vy4567 = wasm_f32x4_div(vb, va4567);
+    v128_t vy89AB = wasm_f32x4_div(vb, va89AB);
+    v128_t vyCDEF = wasm_f32x4_div(vb, vaCDEF);
+
+
+    // ReLU via signed-integer max with +0.0f: negative floats have the sign
+    // bit set and thus compare below 0 as signed ints, so i32x4.max zeroes
+    // exactly the negative lanes.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+    vy89AB = wasm_i32x4_max(vy89AB, vzero);
+    vyCDEF = wasm_i32x4_max(vyCDEF, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_div(vb, va);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // NOTE(review): full-vector load with n < 4 floats remaining — assumes
+    // over-reading a is permitted (XNNPACK padding convention); confirm.
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_div(vb, va);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    // Partial store: 2 low floats via a 64-bit lane, then 1 more float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-scalar-x8.c b/src/f32-vbinary/gen/vrdivc-scalar-x8.c
new file mode 100644
index 0000000..dd491db
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-scalar-x8.c
@@ -0,0 +1,72 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Computes y[i] = (*b) / a[i] (no activation); the broadcast scalar is the
+// NUMERATOR ("r" = reversed operands). Portable scalar microkernel,
+// unrolled to 8 elements per iteration. n is the byte length of a/y;
+// params is not referenced.
+void xnn_f32_vrdivc_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = vb / va0;
+    float vy1 = vb / va1;
+    float vy2 = vb / va2;
+    float vy3 = vb / va3;
+    float vy4 = vb / va4;
+    float vy5 = vb / va5;
+    float vy6 = vb / va6;
+    float vy7 = vb / va7;
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: final 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = vb / va;
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrdivc-wasmsimd-x16.c b/src/f32-vbinary/gen/vrdivc-wasmsimd-x16.c
new file mode 100644
index 0000000..893c366
--- /dev/null
+++ b/src/f32-vbinary/gen/vrdivc-wasmsimd-x16.c
@@ -0,0 +1,77 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Computes y[i] = (*b) / a[i] with WAsm SIMD128 (no activation); the
+// broadcast scalar is the NUMERATOR. Main loop handles 16 elements
+// (4 vectors), then 4-wide, then a partial tail.
+void xnn_f32_vrdivc_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_div(vb, va0123);
+    v128_t vy4567 = wasm_f32x4_div(vb, va4567);
+    v128_t vy89AB = wasm_f32x4_div(vb, va89AB);
+    v128_t vyCDEF = wasm_f32x4_div(vb, vaCDEF);
+
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_div(vb, va);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    // NOTE(review): full-vector load with n < 4 floats remaining — assumes
+    // over-reading a is permitted (XNNPACK padding convention); confirm.
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_div(vb, va);
+
+
+    // Partial store: 2 low floats via a 64-bit lane, then 1 more float.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-minmax-scalar-x8.c b/src/f32-vbinary/gen/vrsubc-minmax-scalar-x8.c
new file mode 100644
index 0000000..bfa0147
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-minmax-scalar-x8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Computes y[i] = clamp((*b) - a[i], params->scalar.min, params->scalar.max).
+// "r" (reversed) subtract-from-constant: the broadcast scalar is the
+// MINUEND. Portable scalar microkernel, unrolled to 8 elements per
+// iteration; n is the byte length of a/y.
+void xnn_f32_vrsubc_minmax_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  const float vb = *b;
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = vb - va0;
+    float vy1 = vb - va1;
+    float vy2 = vb - va2;
+    float vy3 = vb - va3;
+    float vy4 = vb - va4;
+    float vy5 = vb - va5;
+    float vy6 = vb - va6;
+    float vy7 = vb - va7;
+
+
+    // Clamp below, then above.
+    vy0 = math_max_f32(vy0, vy_min);
+    vy1 = math_max_f32(vy1, vy_min);
+    vy2 = math_max_f32(vy2, vy_min);
+    vy3 = math_max_f32(vy3, vy_min);
+    vy4 = math_max_f32(vy4, vy_min);
+    vy5 = math_max_f32(vy5, vy_min);
+    vy6 = math_max_f32(vy6, vy_min);
+    vy7 = math_max_f32(vy7, vy_min);
+
+    vy0 = math_min_f32(vy0, vy_max);
+    vy1 = math_min_f32(vy1, vy_max);
+    vy2 = math_min_f32(vy2, vy_max);
+    vy3 = math_min_f32(vy3, vy_max);
+    vy4 = math_min_f32(vy4, vy_max);
+    vy5 = math_min_f32(vy5, vy_max);
+    vy6 = math_min_f32(vy6, vy_max);
+    vy7 = math_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: final 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = vb - va;
+      vy = math_max_f32(vy, vy_min);
+      vy = math_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-minmax-wasm-x8.c b/src/f32-vbinary/gen/vrsubc-minmax-wasm-x8.c
new file mode 100644
index 0000000..f2f7a0a
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-minmax-wasm-x8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// vrsubc (reverse-subtract by a broadcast constant) with min/max clamping:
+//   y[i] = min(max(b[0] - a[i], params->scalar.min), params->scalar.max)
+// Scalar WebAssembly variant, unrolled 8x. n is the byte length of a and y.
+void xnn_f32_vrsubc_minmax_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  // b points at a single scalar that is broadcast across all outputs.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration (n counts bytes, not elements).
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = vb - va0;
+    float vy1 = vb - va1;
+    float vy2 = vb - va2;
+    float vy3 = vb - va3;
+    float vy4 = vb - va4;
+    float vy5 = vb - va5;
+    float vy6 = vb - va6;
+    float vy7 = vb - va7;
+
+
+    // __builtin_wasm_max_f32/__builtin_wasm_min_f32 lower to the single wasm
+    // f32.max/f32.min instructions (clang WebAssembly builtins).
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+    vy4 = __builtin_wasm_max_f32(vy4, vy_min);
+    vy5 = __builtin_wasm_max_f32(vy5, vy_min);
+    vy6 = __builtin_wasm_max_f32(vy6, vy_min);
+    vy7 = __builtin_wasm_max_f32(vy7, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+    vy4 = __builtin_wasm_min_f32(vy4, vy_max);
+    vy5 = __builtin_wasm_min_f32(vy5, vy_max);
+    vy6 = __builtin_wasm_min_f32(vy6, vy_max);
+    vy7 = __builtin_wasm_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: 1..7 leftover elements, processed one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = vb - va;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x16.c b/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x16.c
new file mode 100644
index 0000000..9cb0e04
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x16.c
@@ -0,0 +1,92 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// vrsubc with min/max clamping, WAsm SIMD, ARM-tuned variant, unrolled to 16
+// floats (4 vectors) per iteration:
+//   y[i] = min(max(b[0] - a[i], params->scalar.min), params->scalar.max)
+// Clamping uses f32x4.max/f32x4.min directly (contrast the x86-tuned sibling,
+// which uses compare + bitselect). n is the byte length of a and y.
+void xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Broadcast the clamp bounds and the scalar operand b[0] to all 4 lanes.
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 16 elements (4 v128 vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_sub(vb, va0123);
+    v128_t vy4567 = wasm_f32x4_sub(vb, va4567);
+    v128_t vy89AB = wasm_f32x4_sub(vb, va89AB);
+    v128_t vyCDEF = wasm_f32x4_sub(vb, vaCDEF);
+
+
+    vy0123 = wasm_f32x4_max(vy0123, vy_min);
+    vy4567 = wasm_f32x4_max(vy4567, vy_min);
+    vy89AB = wasm_f32x4_max(vy89AB, vy_min);
+    vyCDEF = wasm_f32x4_max(vyCDEF, vy_min);
+
+    vy0123 = wasm_f32x4_min(vy0123, vy_max);
+    vy4567 = wasm_f32x4_min(vy4567, vy_max);
+    vy89AB = wasm_f32x4_min(vy89AB, vy_max);
+    vyCDEF = wasm_f32x4_min(vyCDEF, vy_max);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Secondary loop: whole vectors of 4 remaining after the 16-wide loop.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_sub(vb, va);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail: 1..3 leftover elements. NOTE(review): this loads a full 16-byte
+  // vector even though fewer elements remain — assumes readable memory past
+  // the end of a (XNNPACK's usual padding convention); confirm with callers.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_sub(vb, va);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    // Store the low 2 lanes as a single 64-bit copy, then shift lanes down.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x16.c b/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x16.c
new file mode 100644
index 0000000..1bf4e16
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x16.c
@@ -0,0 +1,105 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// vrsubc with min/max clamping, WAsm SIMD, x86-tuned variant, unrolled to 16
+// floats per iteration:
+//   y[i] = min(max(b[0] - a[i], params->scalar.min), params->scalar.max)
+// Unlike the ARM-tuned sibling, clamping here is expressed as f32x4 compares
+// plus v128.bitselect rather than f32x4.min/max — presumably faster on x86
+// WAsm engines (matches the generator template's x86 path; confirm there).
+void xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Broadcast the clamp bounds and the scalar operand b[0] to all 4 lanes.
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 16 elements (4 v128 vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_sub(vb, va0123);
+    v128_t vy4567 = wasm_f32x4_sub(vb, va4567);
+    v128_t vy89AB = wasm_f32x4_sub(vb, va89AB);
+    v128_t vyCDEF = wasm_f32x4_sub(vbCDEF_placeholder, vaCDEF);
+
+
+    // vltmask lanes are all-ones where vy < min -> those lanes take vy_min.
+    const v128_t vltmask0123 = wasm_f32x4_lt(vy0123, vy_min);
+    const v128_t vltmask4567 = wasm_f32x4_lt(vy4567, vy_min);
+    const v128_t vltmask89AB = wasm_f32x4_lt(vy89AB, vy_min);
+    const v128_t vltmaskCDEF = wasm_f32x4_lt(vyCDEF, vy_min);
+
+    // vngtmask lanes are all-ones where vy <= max -> those lanes keep vy;
+    // compares are interleaved with bitselects to expose ILP.
+    const v128_t vngtmask0123 = wasm_f32x4_le(vy0123, vy_max);
+    vy0123 = wasm_v128_bitselect(vy_min, vy0123, vltmask0123);
+    const v128_t vngtmask4567 = wasm_f32x4_le(vy4567, vy_max);
+    vy4567 = wasm_v128_bitselect(vy_min, vy4567, vltmask4567);
+    const v128_t vngtmask89AB = wasm_f32x4_le(vy89AB, vy_max);
+    vy89AB = wasm_v128_bitselect(vy_min, vy89AB, vltmask89AB);
+    const v128_t vngtmaskCDEF = wasm_f32x4_le(vyCDEF, vy_max);
+    vyCDEF = wasm_v128_bitselect(vy_min, vyCDEF, vltmaskCDEF);
+
+    vy0123 = wasm_v128_bitselect(vy0123, vy_max, vngtmask0123);
+    vy4567 = wasm_v128_bitselect(vy4567, vy_max, vngtmask4567);
+    vy89AB = wasm_v128_bitselect(vy89AB, vy_max, vngtmask89AB);
+    vyCDEF = wasm_v128_bitselect(vyCDEF, vy_max, vngtmaskCDEF);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Secondary loop: whole vectors of 4 remaining after the 16-wide loop.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_sub(vb, va);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail: 1..3 leftover elements. NOTE(review): full-vector load past the
+  // logical end of a — assumes readable padding after the buffer; confirm.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_sub(vb, va);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    // Store the low 2 lanes as a single 64-bit copy, then shift lanes down.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-scalar-x8.c b/src/f32-vbinary/gen/vrsubc-relu-scalar-x8.c
new file mode 100644
index 0000000..6ffe1c1
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-scalar-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// vrsubc with ReLU: y[i] = max(b[0] - a[i], 0.0f), portable scalar variant,
+// unrolled 8x. n is the byte length of a and y. params is unused (the ReLU
+// bound is the constant 0).
+void xnn_f32_vrsubc_relu_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // b points at a single scalar that is broadcast across all outputs.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration (n counts bytes, not elements).
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = vb - va0;
+    float vy1 = vb - va1;
+    float vy2 = vb - va2;
+    float vy3 = vb - va3;
+    float vy4 = vb - va4;
+    float vy5 = vb - va5;
+    float vy6 = vb - va6;
+    float vy7 = vb - va7;
+
+
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+    vy4 = math_max_f32(vy4, 0.0f);
+    vy5 = math_max_f32(vy5, 0.0f);
+    vy6 = math_max_f32(vy6, 0.0f);
+    vy7 = math_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: 1..7 leftover elements, processed one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = vb - va;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-wasm-x8.c b/src/f32-vbinary/gen/vrsubc-relu-wasm-x8.c
new file mode 100644
index 0000000..c42630f
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-wasm-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// vrsubc with ReLU: y[i] = max(b[0] - a[i], 0.0f), scalar WebAssembly
+// variant, unrolled 8x. Identical to the __scalar_x8 kernel except that the
+// max is the wasm f32.max instruction via __builtin_wasm_max_f32. params is
+// unused (the ReLU bound is the constant 0).
+void xnn_f32_vrsubc_relu_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // b points at a single scalar that is broadcast across all outputs.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration (n counts bytes, not elements).
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = vb - va0;
+    float vy1 = vb - va1;
+    float vy2 = vb - va2;
+    float vy3 = vb - va3;
+    float vy4 = vb - va4;
+    float vy5 = vb - va5;
+    float vy6 = vb - va6;
+    float vy7 = vb - va7;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+    vy4 = __builtin_wasm_max_f32(vy4, 0.0f);
+    vy5 = __builtin_wasm_max_f32(vy5, 0.0f);
+    vy6 = __builtin_wasm_max_f32(vy6, 0.0f);
+    vy7 = __builtin_wasm_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: 1..7 leftover elements, processed one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = vb - va;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x16.c b/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x16.c
new file mode 100644
index 0000000..4895371
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-relu-wasmsimd-x16.c
@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// vrsubc with ReLU: y[i] = max(b[0] - a[i], 0.0f), WAsm SIMD variant,
+// unrolled to 16 floats (4 vectors) per iteration. params is unused.
+void xnn_f32_vrsubc_relu_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  // Broadcast the scalar operand b[0] to all 4 lanes.
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 16 elements (4 v128 vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_sub(vb, va0123);
+    v128_t vy4567 = wasm_f32x4_sub(vb, va4567);
+    v128_t vy89AB = wasm_f32x4_sub(vb, va89AB);
+    v128_t vyCDEF = wasm_f32x4_sub(vb, vaCDEF);
+
+
+    // ReLU via *signed integer* max against zero: for non-NaN floats the IEEE
+    // bit pattern orders like a sign-magnitude integer, so i32x4.max_s(x, 0)
+    // zeroes every negative lane (including -0.0f) and keeps positive lanes,
+    // avoiding a floating-point compare. NOTE(review): negative-signed NaN
+    // lanes become +0 under this trick — presumed acceptable here; confirm
+    // against the generator template.
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+    vy89AB = wasm_i32x4_max(vy89AB, vzero);
+    vyCDEF = wasm_i32x4_max(vyCDEF, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Secondary loop: whole vectors of 4 remaining after the 16-wide loop.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_sub(vb, va);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail: 1..3 leftover elements. NOTE(review): full-vector load past the
+  // logical end of a — assumes readable padding after the buffer; confirm.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_sub(vb, va);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    // Store the low 2 lanes as a single 64-bit copy, then shift lanes down.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-scalar-x8.c b/src/f32-vbinary/gen/vrsubc-scalar-x8.c
new file mode 100644
index 0000000..df61ad0
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-scalar-x8.c
@@ -0,0 +1,72 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// vrsubc, unclamped: y[i] = b[0] - a[i], portable scalar variant, unrolled
+// 8x. n is the byte length of a and y. params is unused.
+void xnn_f32_vrsubc_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // b points at a single scalar that is broadcast across all outputs.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration (n counts bytes, not elements).
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = vb - va0;
+    float vy1 = vb - va1;
+    float vy2 = vb - va2;
+    float vy3 = vb - va3;
+    float vy4 = vb - va4;
+    float vy5 = vb - va5;
+    float vy6 = vb - va6;
+    float vy7 = vb - va7;
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: 1..7 leftover elements, processed one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = vb - va;
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vrsubc-wasmsimd-x16.c b/src/f32-vbinary/gen/vrsubc-wasmsimd-x16.c
new file mode 100644
index 0000000..5b3bd98
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-wasmsimd-x16.c
@@ -0,0 +1,77 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// vrsubc, unclamped: y[i] = b[0] - a[i], WAsm SIMD variant, unrolled to 16
+// floats (4 vectors) per iteration. params is unused.
+void xnn_f32_vrsubc_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Broadcast the scalar operand b[0] to all 4 lanes.
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 16 elements (4 v128 vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_sub(vb, va0123);
+    v128_t vy4567 = wasm_f32x4_sub(vb, va4567);
+    v128_t vy89AB = wasm_f32x4_sub(vb, va89AB);
+    v128_t vyCDEF = wasm_f32x4_sub(vb, vaCDEF);
+
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Secondary loop: whole vectors of 4 remaining after the 16-wide loop.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_sub(vb, va);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail: 1..3 leftover elements. NOTE(review): full-vector load past the
+  // logical end of a — assumes readable padding after the buffer; confirm.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_sub(vb, va);
+
+
+    // Store the low 2 lanes as a single 64-bit copy, then shift lanes down.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsqrdiff-scalar-x8.c b/src/f32-vbinary/gen/vsqrdiff-scalar-x8.c
new file mode 100644
index 0000000..c2adc61
--- /dev/null
+++ b/src/f32-vbinary/gen/vsqrdiff-scalar-x8.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// vsqrdiff (squared difference): y[i] = (a[i] - b[i])^2 over two full-length
+// inputs, portable scalar variant, unrolled 8x. n is the byte length of a, b
+// and y. params is unused.
+void xnn_f32_vsqrdiff_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 8 elements per iteration (n counts bytes, not elements).
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 - vb0;
+    float vy1 = va1 - vb1;
+    float vy2 = va2 - vb2;
+    float vy3 = va3 - vb3;
+    float vy4 = va4 - vb4;
+    float vy5 = va5 - vb5;
+    float vy6 = va6 - vb6;
+    float vy7 = va7 - vb7;
+
+    // Square the differences in place.
+    vy0 = vy0 * vy0;
+    vy1 = vy1 * vy1;
+    vy2 = vy2 * vy2;
+    vy3 = vy3 * vy3;
+    vy4 = vy4 * vy4;
+    vy5 = vy5 * vy5;
+    vy6 = vy6 * vy6;
+    vy7 = vy7 * vy7;
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: 1..7 leftover elements, processed one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va - vb;
+      vy = vy * vy;
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsqrdiff-wasmsimd-x16.c b/src/f32-vbinary/gen/vsqrdiff-wasmsimd-x16.c
new file mode 100644
index 0000000..298e392
--- /dev/null
+++ b/src/f32-vbinary/gen/vsqrdiff-wasmsimd-x16.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// vsqrdiff (squared difference): y[i] = (a[i] - b[i])^2 over two full-length
+// inputs, WAsm SIMD variant, unrolled to 16 floats (4 vectors) per
+// iteration. params is unused.
+void xnn_f32_vsqrdiff_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 16 elements (4 v128 vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_sub(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_sub(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_sub(vaCDEF, vbCDEF);
+
+    // Square the differences in place.
+    vy0123 = wasm_f32x4_mul(vy0123, vy0123);
+    vy4567 = wasm_f32x4_mul(vy4567, vy4567);
+    vy89AB = wasm_f32x4_mul(vy89AB, vy89AB);
+    vyCDEF = wasm_f32x4_mul(vyCDEF, vyCDEF);
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Secondary loop: whole vectors of 4 remaining after the 16-wide loop.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+    vy = wasm_f32x4_mul(vy, vy);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail: 1..3 leftover elements. NOTE(review): full-vector loads past the
+  // logical end of a and b — assumes readable padding after both buffers;
+  // confirm with callers.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+    vy = wasm_f32x4_mul(vy, vy);
+
+
+    // Store the low 2 lanes as a single 64-bit copy, then shift lanes down.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsqrdiffc-scalar-x8.c b/src/f32-vbinary/gen/vsqrdiffc-scalar-x8.c
new file mode 100644
index 0000000..9ab1c01
--- /dev/null
+++ b/src/f32-vbinary/gen/vsqrdiffc-scalar-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// vsqrdiffc (squared difference with a broadcast constant):
+//   y[i] = (a[i] - b[0])^2
+// Portable scalar variant, unrolled 8x. n is the byte length of a and y.
+// params is unused.
+void xnn_f32_vsqrdiffc_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // b points at a single scalar that is broadcast across all outputs.
+  const float vb = *b;
+  // Main loop: 8 elements per iteration (n counts bytes, not elements).
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 - vb;
+    float vy1 = va1 - vb;
+    float vy2 = va2 - vb;
+    float vy3 = va3 - vb;
+    float vy4 = va4 - vb;
+    float vy5 = va5 - vb;
+    float vy6 = va6 - vb;
+    float vy7 = va7 - vb;
+
+    // Square the differences in place.
+    vy0 = vy0 * vy0;
+    vy1 = vy1 * vy1;
+    vy2 = vy2 * vy2;
+    vy3 = vy3 * vy3;
+    vy4 = vy4 * vy4;
+    vy5 = vy5 * vy5;
+    vy6 = vy6 * vy6;
+    vy7 = vy7 * vy7;
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: 1..7 leftover elements, processed one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va - vb;
+      vy = vy * vy;
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x16.c b/src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x16.c
new file mode 100644
index 0000000..ba4671d
--- /dev/null
+++ b/src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x16.c
@@ -0,0 +1,83 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// vsqrdiffc (squared difference with a broadcast constant):
+//   y[i] = (a[i] - b[0])^2
+// WAsm SIMD variant, unrolled to 16 floats (4 vectors) per iteration.
+// params is unused.
+void xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Broadcast the scalar operand b[0] to all 4 lanes.
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  // Main loop: 16 elements (4 v128 vectors) per iteration.
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_sub(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_sub(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_sub(vaCDEF, vb);
+
+    // Square the differences in place.
+    vy0123 = wasm_f32x4_mul(vy0123, vy0123);
+    vy4567 = wasm_f32x4_mul(vy4567, vy4567);
+    vy89AB = wasm_f32x4_mul(vy89AB, vy89AB);
+    vyCDEF = wasm_f32x4_mul(vyCDEF, vyCDEF);
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Secondary loop: whole vectors of 4 remaining after the 16-wide loop.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+    vy = wasm_f32x4_mul(vy, vy);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail: 1..3 leftover elements. NOTE(review): full-vector load past the
+  // logical end of a — assumes readable padding after the buffer; confirm.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+    vy = wasm_f32x4_mul(vy, vy);
+
+
+    // Store the low 2 lanes as a single 64-bit copy, then shift lanes down.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-minmax-scalar-x8.c b/src/f32-vbinary/gen/vsub-minmax-scalar-x8.c
new file mode 100644
index 0000000..2064624
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-minmax-scalar-x8.c
@@ -0,0 +1,103 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// vsub with min/max clamping over two full-length inputs:
+//   y[i] = min(max(a[i] - b[i], params->scalar.min), params->scalar.max)
+// Portable scalar variant, unrolled 8x. n is the byte length of a, b and y.
+void xnn_f32_vsub_minmax_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  // Main loop: 8 elements per iteration (n counts bytes, not elements).
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 - vb0;
+    float vy1 = va1 - vb1;
+    float vy2 = va2 - vb2;
+    float vy3 = va3 - vb3;
+    float vy4 = va4 - vb4;
+    float vy5 = va5 - vb5;
+    float vy6 = va6 - vb6;
+    float vy7 = va7 - vb7;
+
+
+    vy0 = math_max_f32(vy0, vy_min);
+    vy1 = math_max_f32(vy1, vy_min);
+    vy2 = math_max_f32(vy2, vy_min);
+    vy3 = math_max_f32(vy3, vy_min);
+    vy4 = math_max_f32(vy4, vy_min);
+    vy5 = math_max_f32(vy5, vy_min);
+    vy6 = math_max_f32(vy6, vy_min);
+    vy7 = math_max_f32(vy7, vy_min);
+
+    vy0 = math_min_f32(vy0, vy_max);
+    vy1 = math_min_f32(vy1, vy_max);
+    vy2 = math_min_f32(vy2, vy_max);
+    vy3 = math_min_f32(vy3, vy_max);
+    vy4 = math_min_f32(vy4, vy_max);
+    vy5 = math_min_f32(vy5, vy_max);
+    vy6 = math_min_f32(vy6, vy_max);
+    vy7 = math_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: 1..7 leftover elements, processed one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va - vb;
+      vy = math_max_f32(vy, vy_min);
+      vy = math_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-minmax-wasm-x8.c b/src/f32-vbinary/gen/vsub-minmax-wasm-x8.c
new file mode 100644
index 0000000..fb73f2f
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-minmax-wasm-x8.c
@@ -0,0 +1,103 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Element-wise f32 subtraction with output clamping:
+//   y[i] = min(max(a[i] - b[i], params->scalar.min), params->scalar.max)
+// Scalar WebAssembly kernel, unrolled 8x. `n` is the buffer size in BYTES
+// and is asserted to be a non-zero multiple of sizeof(float).
+// NOTE(review): auto-generated from vop-scalar.c.in -- documentation fixes
+// belong in the template, not in this file.
+void xnn_f32_vsub_minmax_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 - vb0;
+    float vy1 = va1 - vb1;
+    float vy2 = va2 - vb2;
+    float vy3 = va3 - vb3;
+    float vy4 = va4 - vb4;
+    float vy5 = va5 - vb5;
+    float vy6 = va6 - vb6;
+    float vy7 = va7 - vb7;
+
+
+    // __builtin_wasm_max_f32/__builtin_wasm_min_f32 are Clang builtins for
+    // the WAsm f32.max/f32.min instructions -- a branchless clamp.
+    vy0 = math_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+    vy4 = __builtin_wasm_max_f32(vy4, vy_min);
+    vy5 = __builtin_wasm_max_f32(vy5, vy_min);
+    vy6 = __builtin_wasm_max_f32(vy6, vy_min);
+    vy7 = __builtin_wasm_max_f32(vy7, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+    vy4 = __builtin_wasm_min_f32(vy4, vy_max);
+    vy5 = __builtin_wasm_min_f32(vy5, vy_max);
+    vy6 = __builtin_wasm_min_f32(vy6, vy_max);
+    vy7 = __builtin_wasm_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: the trailing 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va - vb;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-minmax-wasmsimd-arm-x16.c b/src/f32-vbinary/gen/vsub-minmax-wasmsimd-arm-x16.c
new file mode 100644
index 0000000..041a4e7
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-minmax-wasmsimd-arm-x16.c
@@ -0,0 +1,102 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Element-wise f32 subtraction with clamping, WAsm SIMD kernel that uses
+// f32x4.min/max directly (the "arm" flavor -- presumably the variant tuned
+// for targets where those instructions lower well; confirm against the
+// generator). Main loop: 16 elements (4 vectors) per iteration; then a
+// 4-at-a-time loop; then a masked 1-3 element tail. `n` is in BYTES.
+// NOTE(review): auto-generated from vop-wasmsimd.c.in.
+void xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  // Broadcast the scalar clamp bounds into all four lanes.
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_sub(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_sub(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_sub(vaCDEF, vbCDEF);
+
+
+    vy0123 = wasm_f32x4_max(vy0123, vy_min);
+    vy4567 = wasm_f32x4_max(vy4567, vy_min);
+    vy89AB = wasm_f32x4_max(vy89AB, vy_min);
+    vyCDEF = wasm_f32x4_max(vyCDEF, vy_min);
+
+    vy0123 = wasm_f32x4_min(vy0123, vy_max);
+    vy4567 = wasm_f32x4_min(vy4567, vy_max);
+    vy89AB = wasm_f32x4_min(vy89AB, vy_max);
+    vyCDEF = wasm_f32x4_min(vyCDEF, vy_max);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Vector remainder: 4 elements at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 elements: a full 16-byte vector is loaded regardless.
+  // NOTE(review): this reads past the logical end of a/b -- presumably
+  // relies on XNNPACK's padded-buffer guarantee; confirm before reuse.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    // Store lanes 0-1 as one f64, then shift lanes 2-3 down for the
+    // final possible single-lane store.
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-minmax-wasmsimd-x86-x16.c b/src/f32-vbinary/gen/vsub-minmax-wasmsimd-x86-x16.c
new file mode 100644
index 0000000..10ad9ea
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-minmax-wasmsimd-x86-x16.c
@@ -0,0 +1,115 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Element-wise f32 subtraction with clamping -- same contract as the
+// wasmsimd_arm variant, but the clamp is built from f32x4.lt/le compares
+// plus v128.bitselect instead of f32x4.min/max; presumably because
+// f32x4.min/max lower to multi-instruction sequences on x86 (SSE min/max
+// NaN ordering differs from WAsm semantics) -- TODO confirm against the
+// generator's rationale. `n` is in BYTES.
+// NOTE(review): auto-generated from vop-wasmsimd.c.in.
+void xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_sub(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_sub(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_sub(vaCDEF, vbCDEF);
+
+
+    // vltmask* selects vy_min where vy < vy_min (lower clamp).
+    const v128_t vltmask0123 = wasm_f32x4_lt(vy0123, vy_min);
+    const v128_t vltmask4567 = wasm_f32x4_lt(vy4567, vy_min);
+    const v128_t vltmask89AB = wasm_f32x4_lt(vy89AB, vy_min);
+    const v128_t vltmaskCDEF = wasm_f32x4_lt(vyCDEF, vy_min);
+
+    // The upper-clamp masks (vy <= vy_max) are computed BEFORE the lower
+    // bitselect overwrites vy*, so both compares see the raw difference.
+    const v128_t vngtmask0123 = wasm_f32x4_le(vy0123, vy_max);
+    vy0123 = wasm_v128_bitselect(vy_min, vy0123, vltmask0123);
+    const v128_t vngtmask4567 = wasm_f32x4_le(vy4567, vy_max);
+    vy4567 = wasm_v128_bitselect(vy_min, vy4567, vltmask4567);
+    const v128_t vngtmask89AB = wasm_f32x4_le(vy89AB, vy_max);
+    vy89AB = wasm_v128_bitselect(vy_min, vy89AB, vltmask89AB);
+    const v128_t vngtmaskCDEF = wasm_f32x4_le(vyCDEF, vy_max);
+    vyCDEF = wasm_v128_bitselect(vy_min, vyCDEF, vltmaskCDEF);
+
+    // Keep vy where vy <= vy_max, otherwise take vy_max (upper clamp).
+    vy0123 = wasm_v128_bitselect(vy0123, vy_max, vngtmask0123);
+    vy4567 = wasm_v128_bitselect(vy4567, vy_max, vngtmask4567);
+    vy89AB = wasm_v128_bitselect(vy89AB, vy_max, vngtmask89AB);
+    vyCDEF = wasm_v128_bitselect(vyCDEF, vy_max, vngtmaskCDEF);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Vector remainder: 4 elements at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 elements: full-vector load -- NOTE(review): reads past the
+  // logical end of a/b; presumably covered by XNNPACK's padding guarantee.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-scalar-x8.c b/src/f32-vbinary/gen/vsub-relu-scalar-x8.c
new file mode 100644
index 0000000..1a16ebc
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-scalar-x8.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Element-wise f32 subtraction with ReLU: y[i] = max(a[i] - b[i], 0.0f).
+// Portable scalar kernel unrolled 8x; `params` is accepted for interface
+// uniformity but not read. `n` is the buffer size in BYTES.
+// NOTE(review): auto-generated from vop-scalar.c.in.
+void xnn_f32_vsub_relu_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 - vb0;
+    float vy1 = va1 - vb1;
+    float vy2 = va2 - vb2;
+    float vy3 = va3 - vb3;
+    float vy4 = va4 - vb4;
+    float vy5 = va5 - vb5;
+    float vy6 = va6 - vb6;
+    float vy7 = va7 - vb7;
+
+
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+    vy4 = math_max_f32(vy4, 0.0f);
+    vy5 = math_max_f32(vy5, 0.0f);
+    vy6 = math_max_f32(vy6, 0.0f);
+    vy7 = math_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: the trailing 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va - vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-wasm-x8.c b/src/f32-vbinary/gen/vsub-relu-wasm-x8.c
new file mode 100644
index 0000000..2a6950f
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-wasm-x8.c
@@ -0,0 +1,91 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Element-wise f32 subtraction with ReLU: y[i] = max(a[i] - b[i], 0.0f).
+// Scalar WebAssembly kernel unrolled 8x; __builtin_wasm_max_f32 lowers to
+// the single WAsm f32.max instruction. `params` is accepted for interface
+// uniformity but not read. `n` is the buffer size in BYTES.
+// NOTE(review): auto-generated from vop-scalar.c.in.
+void xnn_f32_vsub_relu_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 - vb0;
+    float vy1 = va1 - vb1;
+    float vy2 = va2 - vb2;
+    float vy3 = va3 - vb3;
+    float vy4 = va4 - vb4;
+    float vy5 = va5 - vb5;
+    float vy6 = va6 - vb6;
+    float vy7 = va7 - vb7;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+    vy4 = __builtin_wasm_max_f32(vy4, 0.0f);
+    vy5 = __builtin_wasm_max_f32(vy5, 0.0f);
+    vy6 = __builtin_wasm_max_f32(vy6, 0.0f);
+    vy7 = __builtin_wasm_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: the trailing 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va - vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-relu-wasmsimd-x16.c b/src/f32-vbinary/gen/vsub-relu-wasmsimd-x16.c
new file mode 100644
index 0000000..d4a6172
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-relu-wasmsimd-x16.c
@@ -0,0 +1,94 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Element-wise f32 subtraction with ReLU, WAsm SIMD kernel unrolled to 16
+// elements (4 vectors) per main-loop iteration. `n` is in BYTES.
+// The ReLU uses INTEGER max (wasm_i32x4_max) on the raw float bits: any
+// IEEE-754 float with the sign bit set compares negative as a signed i32
+// and any non-negative float compares >= 0, so i32x4.max against zero
+// zeroes out negative results (including -0.0f) in one cheap integer op.
+// NOTE(review): auto-generated from vop-wasmsimd.c.in.
+void xnn_f32_vsub_relu_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_sub(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_sub(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_sub(vaCDEF, vbCDEF);
+
+
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+    vy89AB = wasm_i32x4_max(vy89AB, vzero);
+    vyCDEF = wasm_i32x4_max(vyCDEF, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Vector remainder: 4 elements at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 elements: full-vector load -- NOTE(review): reads past the
+  // logical end of a/b; presumably covered by XNNPACK's padding guarantee.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-scalar-x8.c b/src/f32-vbinary/gen/vsub-scalar-x8.c
new file mode 100644
index 0000000..5f14122
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-scalar-x8.c
@@ -0,0 +1,82 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Element-wise f32 subtraction, no activation: y[i] = a[i] - b[i].
+// Portable scalar kernel unrolled 8x; `params` is accepted for interface
+// uniformity but not read. `n` is the buffer size in BYTES.
+// NOTE(review): auto-generated from vop-scalar.c.in.
+void xnn_f32_vsub_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  // Main loop: 8 elements per iteration.
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    const float vb4 = b[4];
+    const float vb5 = b[5];
+    const float vb6 = b[6];
+    const float vb7 = b[7];
+    b += 8;
+
+    float vy0 = va0 - vb0;
+    float vy1 = va1 - vb1;
+    float vy2 = va2 - vb2;
+    float vy3 = va3 - vb3;
+    float vy4 = va4 - vb4;
+    float vy5 = va5 - vb5;
+    float vy6 = va6 - vb6;
+    float vy7 = va7 - vb7;
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: the trailing 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va - vb;
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsub-wasmsimd-x16.c b/src/f32-vbinary/gen/vsub-wasmsimd-x16.c
new file mode 100644
index 0000000..3d69d16
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-wasmsimd-x16.c
@@ -0,0 +1,87 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Element-wise f32 subtraction, no activation, WAsm SIMD kernel unrolled
+// to 16 elements (4 vectors) per main-loop iteration, followed by a
+// 4-at-a-time loop and a masked 1-3 element tail. `params` is accepted
+// for interface uniformity but not read. `n` is in BYTES.
+// NOTE(review): auto-generated from vop-wasmsimd.c.in.
+void xnn_f32_vsub_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    const v128_t vb0123 = wasm_v128_load(b);
+    const v128_t vb4567 = wasm_v128_load(b + 4);
+    const v128_t vb89AB = wasm_v128_load(b + 8);
+    const v128_t vbCDEF = wasm_v128_load(b + 12);
+    b += 16;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb0123);
+    v128_t vy4567 = wasm_f32x4_sub(va4567, vb4567);
+    v128_t vy89AB = wasm_f32x4_sub(va89AB, vb89AB);
+    v128_t vyCDEF = wasm_f32x4_sub(vaCDEF, vbCDEF);
+
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Vector remainder: 4 elements at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    const v128_t vb = wasm_v128_load(b);
+    b += 4;
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 elements: full-vector load -- NOTE(review): reads past the
+  // logical end of a/b; presumably covered by XNNPACK's padding guarantee.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+    const v128_t vb = wasm_v128_load(b);
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-minmax-scalar-x8.c b/src/f32-vbinary/gen/vsubc-minmax-scalar-x8.c
new file mode 100644
index 0000000..0445fce
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-minmax-scalar-x8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Subtract-constant variant: y[i] = min(max(a[i] - *b, min), max), where
+// `b` points to a SINGLE scalar broadcast against every element of `a`.
+// Portable scalar kernel unrolled 8x. `n` is the buffer size in BYTES.
+// NOTE(review): auto-generated from vopc-scalar.c.in.
+void xnn_f32_vsubc_minmax_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  // The broadcast operand is read once, outside both loops.
+  const float vb = *b;
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 - vb;
+    float vy1 = va1 - vb;
+    float vy2 = va2 - vb;
+    float vy3 = va3 - vb;
+    float vy4 = va4 - vb;
+    float vy5 = va5 - vb;
+    float vy6 = va6 - vb;
+    float vy7 = va7 - vb;
+
+
+    vy0 = math_max_f32(vy0, vy_min);
+    vy1 = math_max_f32(vy1, vy_min);
+    vy2 = math_max_f32(vy2, vy_min);
+    vy3 = math_max_f32(vy3, vy_min);
+    vy4 = math_max_f32(vy4, vy_min);
+    vy5 = math_max_f32(vy5, vy_min);
+    vy6 = math_max_f32(vy6, vy_min);
+    vy7 = math_max_f32(vy7, vy_min);
+
+    vy0 = math_min_f32(vy0, vy_max);
+    vy1 = math_min_f32(vy1, vy_max);
+    vy2 = math_min_f32(vy2, vy_max);
+    vy3 = math_min_f32(vy3, vy_max);
+    vy4 = math_min_f32(vy4, vy_max);
+    vy5 = math_min_f32(vy5, vy_max);
+    vy6 = math_min_f32(vy6, vy_max);
+    vy7 = math_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: the trailing 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va - vb;
+      vy = math_max_f32(vy, vy_min);
+      vy = math_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-minmax-wasm-x8.c b/src/f32-vbinary/gen/vsubc-minmax-wasm-x8.c
new file mode 100644
index 0000000..499607f
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-minmax-wasm-x8.c
@@ -0,0 +1,93 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+// Subtract-constant variant: y[i] = min(max(a[i] - *b, min), max), where
+// `b` points to a SINGLE scalar broadcast against every element of `a`.
+// Scalar WebAssembly kernel unrolled 8x; __builtin_wasm_max/min_f32 lower
+// to the single WAsm f32.max/f32.min instructions. `n` is in BYTES.
+// NOTE(review): auto-generated from vopc-scalar.c.in.
+void xnn_f32_vsubc_minmax_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  // The broadcast operand is read once, outside both loops.
+  const float vb = *b;
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 - vb;
+    float vy1 = va1 - vb;
+    float vy2 = va2 - vb;
+    float vy3 = va3 - vb;
+    float vy4 = va4 - vb;
+    float vy5 = va5 - vb;
+    float vy6 = va6 - vb;
+    float vy7 = va7 - vb;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+    vy4 = __builtin_wasm_max_f32(vy4, vy_min);
+    vy5 = __builtin_wasm_max_f32(vy5, vy_min);
+    vy6 = __builtin_wasm_max_f32(vy6, vy_min);
+    vy7 = __builtin_wasm_max_f32(vy7, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+    vy4 = __builtin_wasm_min_f32(vy4, vy_max);
+    vy5 = __builtin_wasm_min_f32(vy5, vy_max);
+    vy6 = __builtin_wasm_min_f32(vy6, vy_max);
+    vy7 = __builtin_wasm_min_f32(vy7, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  // Remainder: the trailing 1-7 elements, one at a time.
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va - vb;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x16.c b/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x16.c
new file mode 100644
index 0000000..add88b9
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x16.c
@@ -0,0 +1,92 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+// Subtract-constant variant, WAsm SIMD: y[i] = min(max(a[i] - *b, min),
+// max), where `b` points to a SINGLE scalar splatted across all four
+// lanes once, up front. Uses f32x4.min/max directly (the "arm" flavor);
+// 16 elements (4 vectors) per main-loop iteration. `n` is in BYTES.
+// NOTE(review): auto-generated from vopc-wasmsimd.c.in.
+void xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_sub(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_sub(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_sub(vaCDEF, vb);
+
+
+    vy0123 = wasm_f32x4_max(vy0123, vy_min);
+    vy4567 = wasm_f32x4_max(vy4567, vy_min);
+    vy89AB = wasm_f32x4_max(vy89AB, vy_min);
+    vyCDEF = wasm_f32x4_max(vyCDEF, vy_min);
+
+    vy0123 = wasm_f32x4_min(vy0123, vy_max);
+    vy4567 = wasm_f32x4_min(vy4567, vy_max);
+    vy89AB = wasm_f32x4_min(vy89AB, vy_max);
+    vyCDEF = wasm_f32x4_min(vyCDEF, vy_max);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  // Vector remainder: 4 elements at a time.
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  // Tail of 1-3 elements: full-vector load -- NOTE(review): reads past the
+  // logical end of a; presumably covered by XNNPACK's padding guarantee.
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_f32x4_max(vy, vy_min);
+    vy = wasm_f32x4_min(vy, vy_max);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x16.c b/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x16.c
new file mode 100644
index 0000000..65f7022
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x16.c
@@ -0,0 +1,105 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
+  const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_sub(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_sub(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_sub(vaCDEF, vb);
+
+
+    const v128_t vltmask0123 = wasm_f32x4_lt(vy0123, vy_min);
+    const v128_t vltmask4567 = wasm_f32x4_lt(vy4567, vy_min);
+    const v128_t vltmask89AB = wasm_f32x4_lt(vy89AB, vy_min);
+    const v128_t vltmaskCDEF = wasm_f32x4_lt(vyCDEF, vy_min);
+
+    const v128_t vngtmask0123 = wasm_f32x4_le(vy0123, vy_max);
+    vy0123 = wasm_v128_bitselect(vy_min, vy0123, vltmask0123);
+    const v128_t vngtmask4567 = wasm_f32x4_le(vy4567, vy_max);
+    vy4567 = wasm_v128_bitselect(vy_min, vy4567, vltmask4567);
+    const v128_t vngtmask89AB = wasm_f32x4_le(vy89AB, vy_max);
+    vy89AB = wasm_v128_bitselect(vy_min, vy89AB, vltmask89AB);
+    const v128_t vngtmaskCDEF = wasm_f32x4_le(vyCDEF, vy_max);
+    vyCDEF = wasm_v128_bitselect(vy_min, vyCDEF, vltmaskCDEF);
+
+    vy0123 = wasm_v128_bitselect(vy0123, vy_max, vngtmask0123);
+    vy4567 = wasm_v128_bitselect(vy4567, vy_max, vngtmask4567);
+    vy89AB = wasm_v128_bitselect(vy89AB, vy_max, vngtmask89AB);
+    vyCDEF = wasm_v128_bitselect(vyCDEF, vy_max, vngtmaskCDEF);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    const v128_t vltmask = wasm_f32x4_lt(vy, vy_min);
+    const v128_t vngtmask = wasm_f32x4_le(vy, vy_max);
+    vy = wasm_v128_bitselect(vy_min, vy, vltmask);
+    vy = wasm_v128_bitselect(vy, vy_max, vngtmask);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-scalar-x8.c b/src/f32-vbinary/gen/vsubc-relu-scalar-x8.c
new file mode 100644
index 0000000..4af6e27
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-scalar-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsubc_relu_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 - vb;
+    float vy1 = va1 - vb;
+    float vy2 = va2 - vb;
+    float vy3 = va3 - vb;
+    float vy4 = va4 - vb;
+    float vy5 = va5 - vb;
+    float vy6 = va6 - vb;
+    float vy7 = va7 - vb;
+
+
+    vy0 = math_max_f32(vy0, 0.0f);
+    vy1 = math_max_f32(vy1, 0.0f);
+    vy2 = math_max_f32(vy2, 0.0f);
+    vy3 = math_max_f32(vy3, 0.0f);
+    vy4 = math_max_f32(vy4, 0.0f);
+    vy5 = math_max_f32(vy5, 0.0f);
+    vy6 = math_max_f32(vy6, 0.0f);
+    vy7 = math_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va - vb;
+      vy = math_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-wasm-x8.c b/src/f32-vbinary/gen/vsubc-relu-wasm-x8.c
new file mode 100644
index 0000000..06cb95d
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-wasm-x8.c
@@ -0,0 +1,81 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsubc_relu_ukernel__wasm_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 - vb;
+    float vy1 = va1 - vb;
+    float vy2 = va2 - vb;
+    float vy3 = va3 - vb;
+    float vy4 = va4 - vb;
+    float vy5 = va5 - vb;
+    float vy6 = va6 - vb;
+    float vy7 = va7 - vb;
+
+
+    vy0 = __builtin_wasm_max_f32(vy0, 0.0f);
+    vy1 = __builtin_wasm_max_f32(vy1, 0.0f);
+    vy2 = __builtin_wasm_max_f32(vy2, 0.0f);
+    vy3 = __builtin_wasm_max_f32(vy3, 0.0f);
+    vy4 = __builtin_wasm_max_f32(vy4, 0.0f);
+    vy5 = __builtin_wasm_max_f32(vy5, 0.0f);
+    vy6 = __builtin_wasm_max_f32(vy6, 0.0f);
+    vy7 = __builtin_wasm_max_f32(vy7, 0.0f);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va - vb;
+      vy = __builtin_wasm_max_f32(vy, 0.0f);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x16.c b/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x16.c
new file mode 100644
index 0000000..bfffb87
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-relu-wasmsimd-x16.c
@@ -0,0 +1,84 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsubc_relu_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_relu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vzero = wasm_f32x4_splat(0.0f);
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_sub(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_sub(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_sub(vaCDEF, vb);
+
+
+    vy0123 = wasm_i32x4_max(vy0123, vzero);
+    vy4567 = wasm_i32x4_max(vy4567, vzero);
+    vy89AB = wasm_i32x4_max(vy89AB, vzero);
+    vyCDEF = wasm_i32x4_max(vyCDEF, vzero);
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+    vy = wasm_i32x4_max(vy, vzero);
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-scalar-x8.c b/src/f32-vbinary/gen/vsubc-scalar-x8.c
new file mode 100644
index 0000000..e0a34e8
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-scalar-x8.c
@@ -0,0 +1,72 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsubc_ukernel__scalar_x8(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+
+  const float vb = *b;
+  for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    const float va4 = a[4];
+    const float va5 = a[5];
+    const float va6 = a[6];
+    const float va7 = a[7];
+    a += 8;
+
+    float vy0 = va0 - vb;
+    float vy1 = va1 - vb;
+    float vy2 = va2 - vb;
+    float vy3 = va3 - vb;
+    float vy4 = va4 - vb;
+    float vy5 = va5 - vb;
+    float vy6 = va6 - vb;
+    float vy7 = va7 - vb;
+
+
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y[4] = vy4;
+    y[5] = vy5;
+    y[6] = vy6;
+    y[7] = vy7;
+    y += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const float va = *a++;
+      float vy = va - vb;
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}
diff --git a/src/f32-vbinary/gen/vsubc-wasmsimd-x16.c b/src/f32-vbinary/gen/vsubc-wasmsimd-x16.c
new file mode 100644
index 0000000..c491b87
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-wasmsimd-x16.c
@@ -0,0 +1,77 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vsubc_ukernel__wasmsimd_x16(
+    size_t n,
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(b != NULL);
+  assert(y != NULL);
+
+  const v128_t vb = wasm_v32x4_load_splat(b);
+  for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
+    const v128_t va0123 = wasm_v128_load(a);
+    const v128_t va4567 = wasm_v128_load(a + 4);
+    const v128_t va89AB = wasm_v128_load(a + 8);
+    const v128_t vaCDEF = wasm_v128_load(a + 12);
+    a += 16;
+
+    v128_t vy0123 = wasm_f32x4_sub(va0123, vb);
+    v128_t vy4567 = wasm_f32x4_sub(va4567, vb);
+    v128_t vy89AB = wasm_f32x4_sub(va89AB, vb);
+    v128_t vyCDEF = wasm_f32x4_sub(vaCDEF, vb);
+
+
+
+    wasm_v128_store(y, vy0123);
+    wasm_v128_store(y + 4, vy4567);
+    wasm_v128_store(y + 8, vy89AB);
+    wasm_v128_store(y + 12, vyCDEF);
+    y += 16;
+  }
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const v128_t va = wasm_v128_load(a);
+    a += 4;
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+
+    wasm_v128_store(y, vy);
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const v128_t va = wasm_v128_load(a);
+
+    v128_t vy = wasm_f32x4_sub(va, vb);
+
+
+    if (n & (2 * sizeof(float))) {
+      *((double*) y) = wasm_f64x2_extract_lane(vy, 0);
+      vy = wasm_v32x4_shuffle(vy, vy, 2, 3, 2, 3);
+      y += 2;
+    }
+    if (n & (1 * sizeof(float))) {
+      *y = wasm_f32x4_extract_lane(vy, 0);
+    }
+  }
+}
diff --git a/src/init.c b/src/init.c
index fb951c2..a2f4bbb 100644
--- a/src/init.c
+++ b/src/init.c
@@ -576,9 +576,9 @@
       xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc2;
       xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
       xnn_params.f32.vadd = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x4,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x4,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x4,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__scalar_x8,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__scalar_x8,
         .element_tile = 8,
       };
       xnn_params.f32.vdiv = (struct vbinary_parameters) {
@@ -588,33 +588,33 @@
         .element_tile = 2,
       };
       xnn_params.f32.vmax = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x4,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x4,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x4,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__scalar_x8,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__scalar_x8,
         .element_tile = 8,
       };
       xnn_params.f32.vmin = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x4,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x4,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x4,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__scalar_x8,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__scalar_x8,
         .element_tile = 8,
       };
       xnn_params.f32.vmul = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x4,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x4,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x4,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__scalar_x8,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__scalar_x8,
         .element_tile = 8,
       };
       xnn_params.f32.vsub = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x4,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x4,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x4,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__scalar_x8,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__scalar_x8,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__scalar_x8,
         .element_tile = 8,
       };
       xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x4,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x4,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x4,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
         .element_tile = 8,
       };
       xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
@@ -2052,13 +2052,13 @@
     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__wasmsimd_arm;
     if (is_wasm_x86) {
       xnn_params.f32.vadd = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x8,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x8,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x8,
-        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x8,
-        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x8,
-        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x8,
-        .element_tile = 8,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16,
+        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
+        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
+        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
+        .element_tile = 16,
       };
       xnn_params.f32.vdiv = (struct vbinary_parameters) {
         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x4,
@@ -2070,45 +2070,45 @@
         .element_tile = 4,
       };
       xnn_params.f32.vmax = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_x86_x8,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x8,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x8,
-        .element_tile = 8,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_x86_x16,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16,
+        .element_tile = 16,
       };
       xnn_params.f32.vmin = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_x86_x8,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x8,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x8,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_x86_x16,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_x86_x16,
 
-        .element_tile = 8,
+        .element_tile = 16,
       };
       xnn_params.f32.vmul = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x8,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x8,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x8,
-        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x8,
-        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x8,
-        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x8,
-        .element_tile = 8,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16,
+        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
+        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
+        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
+        .element_tile = 16,
       };
       xnn_params.f32.vsub = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x8,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x8,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x8,
-        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x8,
-        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x8,
-        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x8,
-        .element_tile = 8,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16,
+        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
+        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
+        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
+        .element_tile = 16,
       };
     } else {
       xnn_params.f32.vadd = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x8,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x8,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x8,
-        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x8,
-        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x8,
-        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x8,
-        .element_tile = 8,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16,
+        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasmsimd_x16,
+        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
+        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasmsimd_x16,
+        .element_tile = 16,
       };
       xnn_params.f32.vdiv = (struct vbinary_parameters) {
         .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x4,
@@ -2120,41 +2120,41 @@
         .element_tile = 4,
       };
       xnn_params.f32.vmax = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_arm_x8,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x8,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x8,
-        .element_tile = 8,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasmsimd_arm_x16,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16,
+        .element_tile = 16,
       };
       xnn_params.f32.vmin = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_arm_x8,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x8,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x8,
-        .element_tile = 8,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasmsimd_arm_x16,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasmsimd_arm_x16,
+        .element_tile = 16,
       };
       xnn_params.f32.vmul = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x8,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x8,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x8,
-        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x8,
-        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x8,
-        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x8,
-        .element_tile = 8,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16,
+        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasmsimd_x16,
+        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
+        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasmsimd_x16,
+        .element_tile = 16,
       };
       xnn_params.f32.vsub = (struct vbinary_parameters) {
-        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x8,
-        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x8,
-        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x8,
-        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x8,
-        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x8,
-        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x8,
-        .element_tile = 8,
+        .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16,
+        .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16,
+        .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16,
+        .linear.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasmsimd_x16,
+        .linear.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasmsimd_x16,
+        .linear.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasmsimd_x16,
+        .element_tile = 16,
       };
     }
     xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
-      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__wasmsimd_x8,
-      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x8,
-      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x8,
-      .element_tile = 8,
+      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__wasmsimd_x16,
+      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
+      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16,
+      .element_tile = 16,
     };
     if (is_wasm_x86) {
       xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
@@ -2430,9 +2430,9 @@
     xnn_params.f32.raddstoreexpminusmax = xnn_f32_raddstoreexpminusmax_ukernel__scalar_p5_x4_acc2;
     xnn_params.f32.rmax = xnn_f32_rmax_ukernel__scalar;
     xnn_params.f32.vadd = (struct vbinary_parameters) {
-      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasm_x4,
-      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x4,
-      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x4,
+      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_minmax_ukernel__wasm_x8,
+      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
+      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_minmax_ukernel__wasm_x8,
       .element_tile = 8,
     };
     xnn_params.f32.vdiv = (struct vbinary_parameters) {
@@ -2442,33 +2442,33 @@
       .element_tile = 2,
     };
     xnn_params.f32.vmax = (struct vbinary_parameters) {
-      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasm_x4,
-      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x4,
-      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x4,
+      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmax_ukernel__wasm_x8,
+      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
+      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmaxc_ukernel__wasm_x8,
       .element_tile = 8,
     };
     xnn_params.f32.vmin = (struct vbinary_parameters) {
-      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasm_x4,
-      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x4,
-      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x4,
+      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmin_ukernel__wasm_x8,
+      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
+      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vminc_ukernel__wasm_x8,
       .element_tile = 8,
     };
     xnn_params.f32.vmul = (struct vbinary_parameters) {
-      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasm_x4,
-      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x4,
-      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x4,
+      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_minmax_ukernel__wasm_x8,
+      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
+      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_minmax_ukernel__wasm_x8,
       .element_tile = 8,
     };
     xnn_params.f32.vsub = (struct vbinary_parameters) {
-      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasm_x4,
-      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasm_x4,
-      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasm_x4,
+      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_minmax_ukernel__wasm_x8,
+      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_minmax_ukernel__wasm_x8,
+      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_minmax_ukernel__wasm_x8,
       .element_tile = 8,
     };
     xnn_params.f32.vsqrdiff = (struct vbinary_parameters) {
-      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x4,
-      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x4,
-      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x4,
+      .minmax.op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiff_ukernel__scalar_x8,
+      .minmax.opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
+      .minmax.ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsqrdiffc_ukernel__scalar_x8,
       .element_tile = 8,
     };
     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
diff --git a/src/xnnpack/vbinary.h b/src/xnnpack/vbinary.h
index cd083b4..161e6ff 100644
--- a/src/xnnpack/vbinary.h
+++ b/src/xnnpack/vbinary.h
@@ -104,14 +104,18 @@
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__wasmsimd_arm_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__wasmsimd_arm_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__wasmsimd_x86_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__wasmsimd_x86_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__wasmsimd_x86_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmax_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__neon_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__neon_x8)
@@ -123,14 +127,18 @@
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__wasmsimd_arm_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__wasmsimd_arm_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__wasmsimd_x86_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__wasmsimd_x86_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__wasmsimd_x86_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmin_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiff_ukernel__neon_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiff_ukernel__neon_x8)
@@ -142,69 +150,91 @@
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiff_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiff_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiff_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiff_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiff_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiff_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiff_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiff_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vdiv_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vdiv_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vdiv_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vdiv_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vdiv_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vdiv_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vdiv_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vdivc_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vdivc_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vdivc_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vdivc_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vdivc_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vdivc_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vdivc_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrdivc_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrdivc_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrdivc_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrdivc_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrdivc_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrdivc_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrdivc_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__neon_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__neon_x8)
@@ -216,14 +246,18 @@
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__wasmsimd_arm_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__wasmsimd_x86_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vadd_minmax_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__neon_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__neon_x8)
@@ -235,14 +269,18 @@
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdiv_minmax_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__neon_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__neon_x8)
@@ -254,14 +292,18 @@
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmul_minmax_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__neon_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__neon_x8)
@@ -273,50 +315,66 @@
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsub_minmax_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vadd_relu_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vadd_relu_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vadd_relu_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vadd_relu_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vadd_relu_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vadd_relu_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vadd_relu_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vadd_relu_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vadd_relu_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vadd_relu_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vadd_relu_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdiv_relu_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdiv_relu_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdiv_relu_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdiv_relu_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdiv_relu_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdiv_relu_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdiv_relu_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdiv_relu_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdiv_relu_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdiv_relu_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdiv_relu_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmul_relu_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmul_relu_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmul_relu_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmul_relu_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmul_relu_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmul_relu_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmul_relu_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmul_relu_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmul_relu_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmul_relu_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmul_relu_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsub_relu_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsub_relu_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsub_relu_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsub_relu_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsub_relu_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsub_relu_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsub_relu_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsub_relu_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsub_relu_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsub_relu_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsub_relu_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__neon_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__neon_x8)
@@ -328,14 +386,18 @@
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__wasmsimd_arm_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__wasmsimd_x86_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__wasmsimd_x86_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmaxc_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__neon_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__neon_x8)
@@ -347,14 +409,18 @@
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__wasmsimd_arm_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__wasmsimd_arm_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__wasmsimd_x86_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__wasmsimd_x86_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__wasmsimd_x86_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vminc_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiffc_ukernel__neon_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiffc_ukernel__neon_x8)
@@ -366,9 +432,11 @@
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiffc_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiffc_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiffc_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiffc_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiffc_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiffc_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsqrdiffc_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__neon_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__neon_x8)
@@ -380,14 +448,18 @@
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vaddc_minmax_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__neon_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__neon_x8)
@@ -399,14 +471,18 @@
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vdivc_minmax_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__neon_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__neon_x8)
@@ -418,14 +494,18 @@
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrdivc_minmax_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__neon_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__neon_x8)
@@ -437,14 +517,18 @@
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vmulc_minmax_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__neon_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__neon_x8)
@@ -456,14 +540,18 @@
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vsubc_minmax_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__neon_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__neon_x8)
@@ -475,68 +563,90 @@
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__avx512f_x32)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x4)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x8)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_MINMAX_UKERNEL_FUNCTION(xnn_f32_vrsubc_minmax_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vaddc_relu_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vaddc_relu_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vaddc_relu_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vaddc_relu_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vaddc_relu_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vaddc_relu_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vaddc_relu_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vaddc_relu_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vaddc_relu_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vaddc_relu_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vaddc_relu_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdivc_relu_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdivc_relu_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdivc_relu_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdivc_relu_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdivc_relu_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdivc_relu_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdivc_relu_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdivc_relu_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdivc_relu_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdivc_relu_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vdivc_relu_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrdivc_relu_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrdivc_relu_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrdivc_relu_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrdivc_relu_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrdivc_relu_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrdivc_relu_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrdivc_relu_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrdivc_relu_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrdivc_relu_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrdivc_relu_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrdivc_relu_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmulc_relu_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmulc_relu_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmulc_relu_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmulc_relu_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmulc_relu_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmulc_relu_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmulc_relu_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmulc_relu_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmulc_relu_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmulc_relu_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vmulc_relu_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsubc_relu_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsubc_relu_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsubc_relu_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsubc_relu_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsubc_relu_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsubc_relu_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsubc_relu_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsubc_relu_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsubc_relu_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsubc_relu_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vsubc_relu_ukernel__scalar_x8)
 
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrsubc_relu_ukernel__wasmsimd_x4)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrsubc_relu_ukernel__wasmsimd_x8)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrsubc_relu_ukernel__wasmsimd_x16)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrsubc_relu_ukernel__wasm_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrsubc_relu_ukernel__wasm_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrsubc_relu_ukernel__wasm_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrsubc_relu_ukernel__wasm_x8)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrsubc_relu_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrsubc_relu_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrsubc_relu_ukernel__scalar_x4)
+DECLARE_F32_VBINOP_RELU_UKERNEL_FUNCTION(xnn_f32_vrsubc_relu_ukernel__scalar_x8)
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/test/f32-vadd-relu.cc b/test/f32-vadd-relu.cc
index cbc11a8..03ab44a 100644
--- a/test/f32-vadd-relu.cc
+++ b/test/f32-vadd-relu.cc
@@ -139,6 +139,67 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VADD_RELU__WASMSIMD_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vadd_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Add);
+  }
+
+  TEST(F32_VADD_RELU__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vadd_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD_RELU__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vadd_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD_RELU__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vadd_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD_RELU__WASMSIMD_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vadd_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD_RELU__WASMSIMD_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vadd_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD_RELU__WASMSIMD_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vadd_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VADD_RELU__WASM_X1, batch_eq_1) {
     VBinOpMicrokernelTester()
@@ -306,6 +367,67 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VADD_RELU__WASM_X8, batch_eq_8) {
+    VBinOpMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vadd_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Add);
+  }
+
+  TEST(F32_VADD_RELU__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vadd_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD_RELU__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vadd_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD_RELU__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vadd_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD_RELU__WASM_X8, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vadd_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD_RELU__WASM_X8, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vadd_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD_RELU__WASM_X8, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vadd_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VADD_RELU__SCALAR_X1, batch_eq_1) {
   VBinOpMicrokernelTester()
     .batch_size(1)
@@ -465,3 +587,62 @@
       .Test(xnn_f32_vadd_relu_ukernel__scalar_x4, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VADD_RELU__SCALAR_X8, batch_eq_8) {
+  VBinOpMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vadd_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VADD_RELU__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vadd_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADD_RELU__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vadd_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADD_RELU__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vadd_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADD_RELU__SCALAR_X8, inplace_a) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .Test(xnn_f32_vadd_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADD_RELU__SCALAR_X8, inplace_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_b(true)
+      .Test(xnn_f32_vadd_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADD_RELU__SCALAR_X8, inplace_a_and_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .inplace_b(true)
+      .Test(xnn_f32_vadd_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vadd-relu.yaml b/test/f32-vadd-relu.yaml
index 202f31f..e363d0d 100644
--- a/test/f32-vadd-relu.yaml
+++ b/test/f32-vadd-relu.yaml
@@ -4,9 +4,12 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vadd_relu_ukernel__wasmsimd_x4
 - name: xnn_f32_vadd_relu_ukernel__wasmsimd_x8
+- name: xnn_f32_vadd_relu_ukernel__wasmsimd_x16
 - name: xnn_f32_vadd_relu_ukernel__wasm_x1
 - name: xnn_f32_vadd_relu_ukernel__wasm_x2
 - name: xnn_f32_vadd_relu_ukernel__wasm_x4
+- name: xnn_f32_vadd_relu_ukernel__wasm_x8
 - name: xnn_f32_vadd_relu_ukernel__scalar_x1
 - name: xnn_f32_vadd_relu_ukernel__scalar_x2
 - name: xnn_f32_vadd_relu_ukernel__scalar_x4
+- name: xnn_f32_vadd_relu_ukernel__scalar_x8
diff --git a/test/f32-vadd.cc b/test/f32-vadd.cc
index a16a9b7..cec1f6f 100644
--- a/test/f32-vadd.cc
+++ b/test/f32-vadd.cc
@@ -139,6 +139,67 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VADD__WASMSIMD_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vadd_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Add);
+  }
+
+  TEST(F32_VADD__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vadd_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vadd_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vadd_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD__WASMSIMD_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vadd_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD__WASMSIMD_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vadd_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+
+  TEST(F32_VADD__WASMSIMD_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vadd_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Add);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VADD__SCALAR_X1, batch_eq_1) {
   VBinOpMicrokernelTester()
     .batch_size(1)
@@ -298,3 +359,62 @@
       .Test(xnn_f32_vadd_ukernel__scalar_x4, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VADD__SCALAR_X8, batch_eq_8) {
+  VBinOpMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vadd_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VADD__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vadd_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADD__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vadd_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADD__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vadd_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADD__SCALAR_X8, inplace_a) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .Test(xnn_f32_vadd_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADD__SCALAR_X8, inplace_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_b(true)
+      .Test(xnn_f32_vadd_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADD__SCALAR_X8, inplace_a_and_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .inplace_b(true)
+      .Test(xnn_f32_vadd_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Add, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vadd.yaml b/test/f32-vadd.yaml
index 5c0018d..ce4f3c5 100644
--- a/test/f32-vadd.yaml
+++ b/test/f32-vadd.yaml
@@ -4,6 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vadd_ukernel__wasmsimd_x4
 - name: xnn_f32_vadd_ukernel__wasmsimd_x8
+- name: xnn_f32_vadd_ukernel__wasmsimd_x16
 - name: xnn_f32_vadd_ukernel__scalar_x1
 - name: xnn_f32_vadd_ukernel__scalar_x2
 - name: xnn_f32_vadd_ukernel__scalar_x4
+- name: xnn_f32_vadd_ukernel__scalar_x8
diff --git a/test/f32-vaddc-minmax.cc b/test/f32-vaddc-minmax.cc
index e506cc1..170a414 100644
--- a/test/f32-vaddc-minmax.cc
+++ b/test/f32-vaddc-minmax.cc
@@ -674,6 +674,66 @@
 
 
 #if XNN_ARCH_WASMSIMD
+  TEST(F32_VADDC_MINMAX__WASMSIMD_ARM_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::AddC);
+  }
+
+  TEST(F32_VADDC_MINMAX__WASMSIMD_ARM_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASMSIMD_ARM_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASMSIMD_ARM_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASMSIMD_ARM_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASMSIMD_ARM_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASMSIMD_ARM_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
   TEST(F32_VADDC_MINMAX__WASMSIMD_X86_X4, batch_eq_4) {
     VBinOpCMicrokernelTester()
       .batch_size(4)
@@ -793,6 +853,66 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VADDC_MINMAX__WASMSIMD_X86_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::AddC);
+  }
+
+  TEST(F32_VADDC_MINMAX__WASMSIMD_X86_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASMSIMD_X86_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASMSIMD_X86_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASMSIMD_X86_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASMSIMD_X86_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASMSIMD_X86_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VADDC_MINMAX__WASM_X1, batch_eq_1) {
     VBinOpCMicrokernelTester()
@@ -957,6 +1077,66 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VADDC_MINMAX__WASM_X8, batch_eq_8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vaddc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::AddC);
+  }
+
+  TEST(F32_VADDC_MINMAX__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASM_X8, qmin) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_MINMAX__WASM_X8, qmax) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vaddc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VADDC_MINMAX__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -1110,4 +1290,61 @@
       .qmax(128)
       .Test(xnn_f32_vaddc_minmax_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
+}
+
+TEST(F32_VADDC_MINMAX__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vaddc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VADDC_MINMAX__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vaddc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADDC_MINMAX__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vaddc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADDC_MINMAX__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vaddc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADDC_MINMAX__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vaddc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADDC_MINMAX__SCALAR_X8, qmin) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .qmin(128)
+      .Test(xnn_f32_vaddc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADDC_MINMAX__SCALAR_X8, qmax) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .qmax(128)
+      .Test(xnn_f32_vaddc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
 }
\ No newline at end of file
diff --git a/test/f32-vaddc-minmax.yaml b/test/f32-vaddc-minmax.yaml
index f1abccf..d92358b 100644
--- a/test/f32-vaddc-minmax.yaml
+++ b/test/f32-vaddc-minmax.yaml
@@ -12,11 +12,15 @@
 - name: xnn_f32_vaddc_minmax_ukernel__avx512f_x32
 - name: xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x4
 - name: xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_vaddc_minmax_ukernel__wasmsimd_arm_x16
 - name: xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x4
 - name: xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x8
+- name: xnn_f32_vaddc_minmax_ukernel__wasmsimd_x86_x16
 - name: xnn_f32_vaddc_minmax_ukernel__wasm_x1
 - name: xnn_f32_vaddc_minmax_ukernel__wasm_x2
 - name: xnn_f32_vaddc_minmax_ukernel__wasm_x4
+- name: xnn_f32_vaddc_minmax_ukernel__wasm_x8
 - name: xnn_f32_vaddc_minmax_ukernel__scalar_x1
 - name: xnn_f32_vaddc_minmax_ukernel__scalar_x2
 - name: xnn_f32_vaddc_minmax_ukernel__scalar_x4
+- name: xnn_f32_vaddc_minmax_ukernel__scalar_x8
diff --git a/test/f32-vaddc-relu.cc b/test/f32-vaddc-relu.cc
index 0c7ec74..bf59745 100644
--- a/test/f32-vaddc-relu.cc
+++ b/test/f32-vaddc-relu.cc
@@ -101,6 +101,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VADDC_RELU__WASMSIMD_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vaddc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::AddC);
+  }
+
+  TEST(F32_VADDC_RELU__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_RELU__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_RELU__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_RELU__WASMSIMD_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vaddc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VADDC_RELU__WASM_X1, batch_eq_1) {
     VBinOpCMicrokernelTester()
@@ -211,6 +253,48 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VADDC_RELU__WASM_X8, batch_eq_8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vaddc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::AddC);
+  }
+
+  TEST(F32_VADDC_RELU__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_RELU__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_RELU__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC_RELU__WASM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vaddc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VADDC_RELU__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -313,3 +397,43 @@
       .Test(xnn_f32_vaddc_relu_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VADDC_RELU__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vaddc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VADDC_RELU__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vaddc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADDC_RELU__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vaddc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADDC_RELU__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vaddc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADDC_RELU__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vaddc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vaddc-relu.yaml b/test/f32-vaddc-relu.yaml
index 67b9b6e..b9b3911 100644
--- a/test/f32-vaddc-relu.yaml
+++ b/test/f32-vaddc-relu.yaml
@@ -4,9 +4,12 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vaddc_relu_ukernel__wasmsimd_x4
 - name: xnn_f32_vaddc_relu_ukernel__wasmsimd_x8
+- name: xnn_f32_vaddc_relu_ukernel__wasmsimd_x16
 - name: xnn_f32_vaddc_relu_ukernel__wasm_x1
 - name: xnn_f32_vaddc_relu_ukernel__wasm_x2
 - name: xnn_f32_vaddc_relu_ukernel__wasm_x4
+- name: xnn_f32_vaddc_relu_ukernel__wasm_x8
 - name: xnn_f32_vaddc_relu_ukernel__scalar_x1
 - name: xnn_f32_vaddc_relu_ukernel__scalar_x2
 - name: xnn_f32_vaddc_relu_ukernel__scalar_x4
+- name: xnn_f32_vaddc_relu_ukernel__scalar_x8
diff --git a/test/f32-vaddc.cc b/test/f32-vaddc.cc
index f8ce677..96d2262 100644
--- a/test/f32-vaddc.cc
+++ b/test/f32-vaddc.cc
@@ -101,6 +101,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VADDC__WASMSIMD_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vaddc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::AddC);
+  }
+
+  TEST(F32_VADDC__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vaddc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+
+  TEST(F32_VADDC__WASMSIMD_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vaddc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::AddC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VADDC__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -203,3 +245,43 @@
       .Test(xnn_f32_vaddc_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VADDC__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vaddc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VADDC__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vaddc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADDC__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vaddc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADDC__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vaddc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VADDC__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vaddc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::AddC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vaddc.yaml b/test/f32-vaddc.yaml
index 503ccd1..975c839 100644
--- a/test/f32-vaddc.yaml
+++ b/test/f32-vaddc.yaml
@@ -4,6 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vaddc_ukernel__wasmsimd_x4
 - name: xnn_f32_vaddc_ukernel__wasmsimd_x8
+- name: xnn_f32_vaddc_ukernel__wasmsimd_x16
 - name: xnn_f32_vaddc_ukernel__scalar_x1
 - name: xnn_f32_vaddc_ukernel__scalar_x2
 - name: xnn_f32_vaddc_ukernel__scalar_x4
+- name: xnn_f32_vaddc_ukernel__scalar_x8
diff --git a/test/f32-vdiv-minmax.cc b/test/f32-vdiv-minmax.cc
index 858a636..7777685 100644
--- a/test/f32-vdiv-minmax.cc
+++ b/test/f32-vdiv-minmax.cc
@@ -880,6 +880,85 @@
 
 
 #if XNN_ARCH_WASMSIMD
+  TEST(F32_VDIV_MINMAX__WASMSIMD_ARM_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Div);
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_ARM_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_ARM_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_ARM_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_ARM_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_ARM_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_ARM_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_ARM_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_ARM_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
   TEST(F32_VDIV_MINMAX__WASMSIMD_X86_X4, batch_eq_4) {
     VBinOpMicrokernelTester()
       .batch_size(4)
@@ -1037,6 +1116,85 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VDIV_MINMAX__WASMSIMD_X86_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Div);
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_X86_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_X86_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_X86_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_X86_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_X86_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_X86_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_X86_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASMSIMD_X86_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VDIV_MINMAX__WASM_X1, batch_eq_1) {
     VBinOpMicrokernelTester()
@@ -1258,6 +1416,85 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VDIV_MINMAX__WASM_X8, batch_eq_8) {
+    VBinOpMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vdiv_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+  }
+
+  TEST(F32_VDIV_MINMAX__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASM_X8, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASM_X8, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASM_X8, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASM_X8, qmin) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_MINMAX__WASM_X8, qmax) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vdiv_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VDIV_MINMAX__SCALAR_X1, batch_eq_1) {
   VBinOpMicrokernelTester()
     .batch_size(1)
@@ -1468,4 +1705,80 @@
       .qmax(128)
       .Test(xnn_f32_vdiv_minmax_ukernel__scalar_x4, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
   }
+}
+
+TEST(F32_VDIV_MINMAX__SCALAR_X8, batch_eq_8) {
+  VBinOpMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vdiv_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VDIV_MINMAX__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdiv_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV_MINMAX__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdiv_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV_MINMAX__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdiv_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV_MINMAX__SCALAR_X8, inplace_a) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .Test(xnn_f32_vdiv_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV_MINMAX__SCALAR_X8, inplace_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_b(true)
+      .Test(xnn_f32_vdiv_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV_MINMAX__SCALAR_X8, inplace_a_and_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .inplace_b(true)
+      .Test(xnn_f32_vdiv_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV_MINMAX__SCALAR_X8, qmin) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .qmin(128)
+      .Test(xnn_f32_vdiv_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV_MINMAX__SCALAR_X8, qmax) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .qmax(128)
+      .Test(xnn_f32_vdiv_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
 }
\ No newline at end of file
diff --git a/test/f32-vdiv-minmax.yaml b/test/f32-vdiv-minmax.yaml
index 82ecb37..f030359 100644
--- a/test/f32-vdiv-minmax.yaml
+++ b/test/f32-vdiv-minmax.yaml
@@ -16,11 +16,15 @@
 - name: xnn_f32_vdiv_minmax_ukernel__avx512f_x32
 - name: xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x4
 - name: xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_vdiv_minmax_ukernel__wasmsimd_arm_x16
 - name: xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x4
 - name: xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x8
+- name: xnn_f32_vdiv_minmax_ukernel__wasmsimd_x86_x16
 - name: xnn_f32_vdiv_minmax_ukernel__wasm_x1
 - name: xnn_f32_vdiv_minmax_ukernel__wasm_x2
 - name: xnn_f32_vdiv_minmax_ukernel__wasm_x4
+- name: xnn_f32_vdiv_minmax_ukernel__wasm_x8
 - name: xnn_f32_vdiv_minmax_ukernel__scalar_x1
 - name: xnn_f32_vdiv_minmax_ukernel__scalar_x2
 - name: xnn_f32_vdiv_minmax_ukernel__scalar_x4
+- name: xnn_f32_vdiv_minmax_ukernel__scalar_x8
diff --git a/test/f32-vdiv-relu.cc b/test/f32-vdiv-relu.cc
index 63b74c9..52e082a 100644
--- a/test/f32-vdiv-relu.cc
+++ b/test/f32-vdiv-relu.cc
@@ -139,6 +139,67 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VDIV_RELU__WASMSIMD_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vdiv_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Div);
+  }
+
+  TEST(F32_VDIV_RELU__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_RELU__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_RELU__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_RELU__WASMSIMD_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vdiv_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_RELU__WASMSIMD_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vdiv_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_RELU__WASMSIMD_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vdiv_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VDIV_RELU__WASM_X1, batch_eq_1) {
     VBinOpMicrokernelTester()
@@ -306,6 +367,67 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VDIV_RELU__WASM_X8, batch_eq_8) {
+    VBinOpMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vdiv_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+  }
+
+  TEST(F32_VDIV_RELU__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_RELU__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_RELU__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_RELU__WASM_X8, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vdiv_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_RELU__WASM_X8, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vdiv_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV_RELU__WASM_X8, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vdiv_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VDIV_RELU__SCALAR_X1, batch_eq_1) {
   VBinOpMicrokernelTester()
     .batch_size(1)
@@ -465,3 +587,62 @@
       .Test(xnn_f32_vdiv_relu_ukernel__scalar_x4, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VDIV_RELU__SCALAR_X8, batch_eq_8) {
+  VBinOpMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vdiv_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VDIV_RELU__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdiv_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV_RELU__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdiv_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV_RELU__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdiv_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV_RELU__SCALAR_X8, inplace_a) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .Test(xnn_f32_vdiv_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV_RELU__SCALAR_X8, inplace_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_b(true)
+      .Test(xnn_f32_vdiv_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV_RELU__SCALAR_X8, inplace_a_and_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .inplace_b(true)
+      .Test(xnn_f32_vdiv_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vdiv-relu.yaml b/test/f32-vdiv-relu.yaml
index da369d9..cd0b518 100644
--- a/test/f32-vdiv-relu.yaml
+++ b/test/f32-vdiv-relu.yaml
@@ -4,9 +4,12 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vdiv_relu_ukernel__wasmsimd_x4
 - name: xnn_f32_vdiv_relu_ukernel__wasmsimd_x8
+- name: xnn_f32_vdiv_relu_ukernel__wasmsimd_x16
 - name: xnn_f32_vdiv_relu_ukernel__wasm_x1
 - name: xnn_f32_vdiv_relu_ukernel__wasm_x2
 - name: xnn_f32_vdiv_relu_ukernel__wasm_x4
+- name: xnn_f32_vdiv_relu_ukernel__wasm_x8
 - name: xnn_f32_vdiv_relu_ukernel__scalar_x1
 - name: xnn_f32_vdiv_relu_ukernel__scalar_x2
 - name: xnn_f32_vdiv_relu_ukernel__scalar_x4
+- name: xnn_f32_vdiv_relu_ukernel__scalar_x8
diff --git a/test/f32-vdiv.cc b/test/f32-vdiv.cc
index f311cb5..e20b8ca 100644
--- a/test/f32-vdiv.cc
+++ b/test/f32-vdiv.cc
@@ -139,6 +139,67 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VDIV__WASMSIMD_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vdiv_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Div);
+  }
+
+  TEST(F32_VDIV__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdiv_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV__WASMSIMD_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vdiv_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV__WASMSIMD_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vdiv_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+
+  TEST(F32_VDIV__WASMSIMD_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vdiv_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Div);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VDIV__SCALAR_X1, batch_eq_1) {
   VBinOpMicrokernelTester()
     .batch_size(1)
@@ -298,3 +359,62 @@
       .Test(xnn_f32_vdiv_ukernel__scalar_x4, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VDIV__SCALAR_X8, batch_eq_8) {
+  VBinOpMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vdiv_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VDIV__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdiv_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdiv_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdiv_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV__SCALAR_X8, inplace_a) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .Test(xnn_f32_vdiv_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV__SCALAR_X8, inplace_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_b(true)
+      .Test(xnn_f32_vdiv_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIV__SCALAR_X8, inplace_a_and_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .inplace_b(true)
+      .Test(xnn_f32_vdiv_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Div, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vdiv.yaml b/test/f32-vdiv.yaml
index a303ad7..e055a6a 100644
--- a/test/f32-vdiv.yaml
+++ b/test/f32-vdiv.yaml
@@ -4,6 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vdiv_ukernel__wasmsimd_x4
 - name: xnn_f32_vdiv_ukernel__wasmsimd_x8
+- name: xnn_f32_vdiv_ukernel__wasmsimd_x16
 - name: xnn_f32_vdiv_ukernel__scalar_x1
 - name: xnn_f32_vdiv_ukernel__scalar_x2
 - name: xnn_f32_vdiv_ukernel__scalar_x4
+- name: xnn_f32_vdiv_ukernel__scalar_x8
diff --git a/test/f32-vdivc-minmax.cc b/test/f32-vdivc-minmax.cc
index eb345aa..2431158 100644
--- a/test/f32-vdivc-minmax.cc
+++ b/test/f32-vdivc-minmax.cc
@@ -674,6 +674,66 @@
 
 
 #if XNN_ARCH_WASMSIMD
+  TEST(F32_VDIVC_MINMAX__WASMSIMD_ARM_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::DivC);
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASMSIMD_ARM_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASMSIMD_ARM_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASMSIMD_ARM_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASMSIMD_ARM_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASMSIMD_ARM_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASMSIMD_ARM_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
   TEST(F32_VDIVC_MINMAX__WASMSIMD_X86_X4, batch_eq_4) {
     VBinOpCMicrokernelTester()
       .batch_size(4)
@@ -793,6 +853,66 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VDIVC_MINMAX__WASMSIMD_X86_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::DivC);
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASMSIMD_X86_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASMSIMD_X86_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASMSIMD_X86_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASMSIMD_X86_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASMSIMD_X86_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASMSIMD_X86_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VDIVC_MINMAX__WASM_X1, batch_eq_1) {
     VBinOpCMicrokernelTester()
@@ -957,6 +1077,66 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VDIVC_MINMAX__WASM_X8, batch_eq_8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vdivc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::DivC);
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASM_X8, qmin) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_MINMAX__WASM_X8, qmax) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vdivc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VDIVC_MINMAX__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -1110,4 +1290,61 @@
       .qmax(128)
       .Test(xnn_f32_vdivc_minmax_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
+}
+
+TEST(F32_VDIVC_MINMAX__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vdivc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VDIVC_MINMAX__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdivc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIVC_MINMAX__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdivc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIVC_MINMAX__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdivc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIVC_MINMAX__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vdivc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIVC_MINMAX__SCALAR_X8, qmin) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .qmin(128)
+      .Test(xnn_f32_vdivc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIVC_MINMAX__SCALAR_X8, qmax) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .qmax(128)
+      .Test(xnn_f32_vdivc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
-}
\ No newline at end of file
+}
diff --git a/test/f32-vdivc-minmax.yaml b/test/f32-vdivc-minmax.yaml
index 229afc7..ea7fa93 100644
--- a/test/f32-vdivc-minmax.yaml
+++ b/test/f32-vdivc-minmax.yaml
@@ -16,11 +16,15 @@
 - name: xnn_f32_vdivc_minmax_ukernel__avx512f_x32
 - name: xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x4
 - name: xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_vdivc_minmax_ukernel__wasmsimd_arm_x16
 - name: xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x4
 - name: xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x8
+- name: xnn_f32_vdivc_minmax_ukernel__wasmsimd_x86_x16
 - name: xnn_f32_vdivc_minmax_ukernel__wasm_x1
 - name: xnn_f32_vdivc_minmax_ukernel__wasm_x2
 - name: xnn_f32_vdivc_minmax_ukernel__wasm_x4
+- name: xnn_f32_vdivc_minmax_ukernel__wasm_x8
 - name: xnn_f32_vdivc_minmax_ukernel__scalar_x1
 - name: xnn_f32_vdivc_minmax_ukernel__scalar_x2
 - name: xnn_f32_vdivc_minmax_ukernel__scalar_x4
+- name: xnn_f32_vdivc_minmax_ukernel__scalar_x8
diff --git a/test/f32-vdivc-relu.cc b/test/f32-vdivc-relu.cc
index 40da21a..b89f98a 100644
--- a/test/f32-vdivc-relu.cc
+++ b/test/f32-vdivc-relu.cc
@@ -101,6 +101,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VDIVC_RELU__WASMSIMD_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vdivc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::DivC);
+  }
+
+  TEST(F32_VDIVC_RELU__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_RELU__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_RELU__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_RELU__WASMSIMD_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vdivc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VDIVC_RELU__WASM_X1, batch_eq_1) {
     VBinOpCMicrokernelTester()
@@ -211,6 +253,48 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VDIVC_RELU__WASM_X8, batch_eq_8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vdivc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::DivC);
+  }
+
+  TEST(F32_VDIVC_RELU__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_RELU__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_RELU__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC_RELU__WASM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vdivc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VDIVC_RELU__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -313,3 +397,43 @@
       .Test(xnn_f32_vdivc_relu_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VDIVC_RELU__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vdivc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VDIVC_RELU__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdivc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIVC_RELU__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdivc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIVC_RELU__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdivc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIVC_RELU__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vdivc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vdivc-relu.yaml b/test/f32-vdivc-relu.yaml
index d976958..d64fad7 100644
--- a/test/f32-vdivc-relu.yaml
+++ b/test/f32-vdivc-relu.yaml
@@ -4,9 +4,12 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vdivc_relu_ukernel__wasmsimd_x4
 - name: xnn_f32_vdivc_relu_ukernel__wasmsimd_x8
+- name: xnn_f32_vdivc_relu_ukernel__wasmsimd_x16
 - name: xnn_f32_vdivc_relu_ukernel__wasm_x1
 - name: xnn_f32_vdivc_relu_ukernel__wasm_x2
 - name: xnn_f32_vdivc_relu_ukernel__wasm_x4
+- name: xnn_f32_vdivc_relu_ukernel__wasm_x8
 - name: xnn_f32_vdivc_relu_ukernel__scalar_x1
 - name: xnn_f32_vdivc_relu_ukernel__scalar_x2
 - name: xnn_f32_vdivc_relu_ukernel__scalar_x4
+- name: xnn_f32_vdivc_relu_ukernel__scalar_x8
diff --git a/test/f32-vdivc.cc b/test/f32-vdivc.cc
index cd8a917..b8493e6 100644
--- a/test/f32-vdivc.cc
+++ b/test/f32-vdivc.cc
@@ -101,6 +101,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VDIVC__WASMSIMD_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vdivc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::DivC);
+  }
+
+  TEST(F32_VDIVC__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vdivc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+
+  TEST(F32_VDIVC__WASMSIMD_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vdivc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::DivC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VDIVC__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -203,3 +245,43 @@
       .Test(xnn_f32_vdivc_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VDIVC__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vdivc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VDIVC__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdivc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIVC__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdivc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIVC__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vdivc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VDIVC__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vdivc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::DivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vdivc.yaml b/test/f32-vdivc.yaml
index c6c01c4..490750f 100644
--- a/test/f32-vdivc.yaml
+++ b/test/f32-vdivc.yaml
@@ -4,6 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vdivc_ukernel__wasmsimd_x4
 - name: xnn_f32_vdivc_ukernel__wasmsimd_x8
+- name: xnn_f32_vdivc_ukernel__wasmsimd_x16
 - name: xnn_f32_vdivc_ukernel__scalar_x1
 - name: xnn_f32_vdivc_ukernel__scalar_x2
 - name: xnn_f32_vdivc_ukernel__scalar_x4
+- name: xnn_f32_vdivc_ukernel__scalar_x8
diff --git a/test/f32-vmax.cc b/test/f32-vmax.cc
index 3dbe60a..6f0ea5c 100644
--- a/test/f32-vmax.cc
+++ b/test/f32-vmax.cc
@@ -684,6 +684,67 @@
 
 
 #if XNN_ARCH_WASMSIMD
+  TEST(F32_VMAX__WASMSIMD_ARM_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Max);
+  }
+
+  TEST(F32_VMAX__WASMSIMD_ARM_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASMSIMD_ARM_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASMSIMD_ARM_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASMSIMD_ARM_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASMSIMD_ARM_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASMSIMD_ARM_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
   TEST(F32_VMAX__WASMSIMD_X86_X4, batch_eq_4) {
     VBinOpMicrokernelTester()
       .batch_size(4)
@@ -805,6 +866,67 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VMAX__WASMSIMD_X86_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Max);
+  }
+
+  TEST(F32_VMAX__WASMSIMD_X86_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASMSIMD_X86_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASMSIMD_X86_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASMSIMD_X86_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASMSIMD_X86_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASMSIMD_X86_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VMAX__WASM_X1, batch_eq_1) {
     VBinOpMicrokernelTester()
@@ -972,6 +1094,67 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VMAX__WASM_X8, batch_eq_8) {
+    VBinOpMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Max);
+  }
+
+  TEST(F32_VMAX__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASM_X8, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASM_X8, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+
+  TEST(F32_VMAX__WASM_X8, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Max);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VMAX__SCALAR_X1, batch_eq_1) {
   VBinOpMicrokernelTester()
     .batch_size(1)
@@ -1131,3 +1314,62 @@
       .Test(xnn_f32_vmax_ukernel__scalar_x4, VBinOpMicrokernelTester::OpType::Max, VBinOpMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VMAX__SCALAR_X8, batch_eq_8) {
+  VBinOpMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Max, VBinOpMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VMAX__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Max, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMAX__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Max, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMAX__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Max, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMAX__SCALAR_X8, inplace_a) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .Test(xnn_f32_vmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Max, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMAX__SCALAR_X8, inplace_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_b(true)
+      .Test(xnn_f32_vmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Max, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMAX__SCALAR_X8, inplace_a_and_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .inplace_b(true)
+      .Test(xnn_f32_vmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Max, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vmax.yaml b/test/f32-vmax.yaml
index e947dce..e28ecc0 100644
--- a/test/f32-vmax.yaml
+++ b/test/f32-vmax.yaml
@@ -12,11 +12,15 @@
 - name: xnn_f32_vmax_ukernel__avx512f_x32
 - name: xnn_f32_vmax_ukernel__wasmsimd_arm_x4
 - name: xnn_f32_vmax_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_vmax_ukernel__wasmsimd_arm_x16
 - name: xnn_f32_vmax_ukernel__wasmsimd_x86_x4
 - name: xnn_f32_vmax_ukernel__wasmsimd_x86_x8
+- name: xnn_f32_vmax_ukernel__wasmsimd_x86_x16
 - name: xnn_f32_vmax_ukernel__wasm_x1
 - name: xnn_f32_vmax_ukernel__wasm_x2
 - name: xnn_f32_vmax_ukernel__wasm_x4
+- name: xnn_f32_vmax_ukernel__wasm_x8
 - name: xnn_f32_vmax_ukernel__scalar_x1
 - name: xnn_f32_vmax_ukernel__scalar_x2
 - name: xnn_f32_vmax_ukernel__scalar_x4
+- name: xnn_f32_vmax_ukernel__scalar_x8
diff --git a/test/f32-vmaxc.cc b/test/f32-vmaxc.cc
index 531646c..3cd1dd8 100644
--- a/test/f32-vmaxc.cc
+++ b/test/f32-vmaxc.cc
@@ -478,6 +478,48 @@
 
 
 #if XNN_ARCH_WASMSIMD
+  TEST(F32_VMAXC__WASMSIMD_ARM_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MaxC);
+  }
+
+  TEST(F32_VMAXC__WASMSIMD_ARM_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MaxC);
+    }
+  }
+
+  TEST(F32_VMAXC__WASMSIMD_ARM_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MaxC);
+    }
+  }
+
+  TEST(F32_VMAXC__WASMSIMD_ARM_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MaxC);
+    }
+  }
+
+  TEST(F32_VMAXC__WASMSIMD_ARM_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MaxC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
   TEST(F32_VMAXC__WASMSIMD_X86_X4, batch_eq_4) {
     VBinOpCMicrokernelTester()
       .batch_size(4)
@@ -561,6 +603,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VMAXC__WASMSIMD_X86_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MaxC);
+  }
+
+  TEST(F32_VMAXC__WASMSIMD_X86_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MaxC);
+    }
+  }
+
+  TEST(F32_VMAXC__WASMSIMD_X86_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MaxC);
+    }
+  }
+
+  TEST(F32_VMAXC__WASMSIMD_X86_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MaxC);
+    }
+  }
+
+  TEST(F32_VMAXC__WASMSIMD_X86_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MaxC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VMAXC__WASM_X1, batch_eq_1) {
     VBinOpCMicrokernelTester()
@@ -671,6 +755,48 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VMAXC__WASM_X8, batch_eq_8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vmaxc_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MaxC);
+  }
+
+  TEST(F32_VMAXC__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmaxc_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MaxC);
+    }
+  }
+
+  TEST(F32_VMAXC__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmaxc_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MaxC);
+    }
+  }
+
+  TEST(F32_VMAXC__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmaxc_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MaxC);
+    }
+  }
+
+  TEST(F32_VMAXC__WASM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vmaxc_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MaxC);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VMAXC__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -773,3 +899,43 @@
       .Test(xnn_f32_vmaxc_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::MaxC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VMAXC__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vmaxc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MaxC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VMAXC__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmaxc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MaxC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMAXC__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmaxc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MaxC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMAXC__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmaxc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MaxC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMAXC__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vmaxc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MaxC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vmaxc.yaml b/test/f32-vmaxc.yaml
index 43859f7..824f25b 100644
--- a/test/f32-vmaxc.yaml
+++ b/test/f32-vmaxc.yaml
@@ -12,11 +12,15 @@
 - name: xnn_f32_vmaxc_ukernel__avx512f_x32
 - name: xnn_f32_vmaxc_ukernel__wasmsimd_arm_x4
 - name: xnn_f32_vmaxc_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_vmaxc_ukernel__wasmsimd_arm_x16
 - name: xnn_f32_vmaxc_ukernel__wasmsimd_x86_x4
 - name: xnn_f32_vmaxc_ukernel__wasmsimd_x86_x8
+- name: xnn_f32_vmaxc_ukernel__wasmsimd_x86_x16
 - name: xnn_f32_vmaxc_ukernel__wasm_x1
 - name: xnn_f32_vmaxc_ukernel__wasm_x2
 - name: xnn_f32_vmaxc_ukernel__wasm_x4
+- name: xnn_f32_vmaxc_ukernel__wasm_x8
 - name: xnn_f32_vmaxc_ukernel__scalar_x1
 - name: xnn_f32_vmaxc_ukernel__scalar_x2
 - name: xnn_f32_vmaxc_ukernel__scalar_x4
+- name: xnn_f32_vmaxc_ukernel__scalar_x8
diff --git a/test/f32-vmin.cc b/test/f32-vmin.cc
index ed3a389..8de0a17 100644
--- a/test/f32-vmin.cc
+++ b/test/f32-vmin.cc
@@ -684,6 +684,67 @@
 
 
 #if XNN_ARCH_WASMSIMD
+  TEST(F32_VMIN__WASMSIMD_ARM_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vmin_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Min);
+  }
+
+  TEST(F32_VMIN__WASMSIMD_ARM_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmin_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASMSIMD_ARM_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmin_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASMSIMD_ARM_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmin_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASMSIMD_ARM_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vmin_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASMSIMD_ARM_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vmin_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASMSIMD_ARM_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vmin_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
   TEST(F32_VMIN__WASMSIMD_X86_X4, batch_eq_4) {
     VBinOpMicrokernelTester()
       .batch_size(4)
@@ -805,6 +866,67 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VMIN__WASMSIMD_X86_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vmin_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Min);
+  }
+
+  TEST(F32_VMIN__WASMSIMD_X86_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmin_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASMSIMD_X86_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmin_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASMSIMD_X86_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmin_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASMSIMD_X86_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vmin_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASMSIMD_X86_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vmin_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASMSIMD_X86_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vmin_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VMIN__WASM_X1, batch_eq_1) {
     VBinOpMicrokernelTester()
@@ -972,6 +1094,67 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VMIN__WASM_X8, batch_eq_8) {
+    VBinOpMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vmin_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Min);
+  }
+
+  TEST(F32_VMIN__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmin_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmin_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmin_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASM_X8, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vmin_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASM_X8, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vmin_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+
+  TEST(F32_VMIN__WASM_X8, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vmin_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Min);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VMIN__SCALAR_X1, batch_eq_1) {
   VBinOpMicrokernelTester()
     .batch_size(1)
@@ -1131,3 +1314,62 @@
       .Test(xnn_f32_vmin_ukernel__scalar_x4, VBinOpMicrokernelTester::OpType::Min, VBinOpMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VMIN__SCALAR_X8, batch_eq_8) {
+  VBinOpMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vmin_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Min, VBinOpMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VMIN__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmin_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Min, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMIN__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmin_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Min, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMIN__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmin_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Min, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMIN__SCALAR_X8, inplace_a) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .Test(xnn_f32_vmin_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Min, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMIN__SCALAR_X8, inplace_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_b(true)
+      .Test(xnn_f32_vmin_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Min, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMIN__SCALAR_X8, inplace_a_and_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .inplace_b(true)
+      .Test(xnn_f32_vmin_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Min, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vmin.yaml b/test/f32-vmin.yaml
index 75642bc..391c85c 100644
--- a/test/f32-vmin.yaml
+++ b/test/f32-vmin.yaml
@@ -12,11 +12,15 @@
 - name: xnn_f32_vmin_ukernel__avx512f_x32
 - name: xnn_f32_vmin_ukernel__wasmsimd_arm_x4
 - name: xnn_f32_vmin_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_vmin_ukernel__wasmsimd_arm_x16
 - name: xnn_f32_vmin_ukernel__wasmsimd_x86_x4
 - name: xnn_f32_vmin_ukernel__wasmsimd_x86_x8
+- name: xnn_f32_vmin_ukernel__wasmsimd_x86_x16
 - name: xnn_f32_vmin_ukernel__wasm_x1
 - name: xnn_f32_vmin_ukernel__wasm_x2
 - name: xnn_f32_vmin_ukernel__wasm_x4
+- name: xnn_f32_vmin_ukernel__wasm_x8
 - name: xnn_f32_vmin_ukernel__scalar_x1
 - name: xnn_f32_vmin_ukernel__scalar_x2
 - name: xnn_f32_vmin_ukernel__scalar_x4
+- name: xnn_f32_vmin_ukernel__scalar_x8
diff --git a/test/f32-vminc.cc b/test/f32-vminc.cc
index 99696b1..5c7c3b2 100644
--- a/test/f32-vminc.cc
+++ b/test/f32-vminc.cc
@@ -478,6 +478,48 @@
 
 
 #if XNN_ARCH_WASMSIMD
+  TEST(F32_VMINC__WASMSIMD_ARM_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vminc_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MinC);
+  }
+
+  TEST(F32_VMINC__WASMSIMD_ARM_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vminc_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MinC);
+    }
+  }
+
+  TEST(F32_VMINC__WASMSIMD_ARM_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vminc_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MinC);
+    }
+  }
+
+  TEST(F32_VMINC__WASMSIMD_ARM_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vminc_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MinC);
+    }
+  }
+
+  TEST(F32_VMINC__WASMSIMD_ARM_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vminc_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MinC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
   TEST(F32_VMINC__WASMSIMD_X86_X4, batch_eq_4) {
     VBinOpCMicrokernelTester()
       .batch_size(4)
@@ -561,6 +603,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VMINC__WASMSIMD_X86_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vminc_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MinC);
+  }
+
+  TEST(F32_VMINC__WASMSIMD_X86_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vminc_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MinC);
+    }
+  }
+
+  TEST(F32_VMINC__WASMSIMD_X86_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vminc_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MinC);
+    }
+  }
+
+  TEST(F32_VMINC__WASMSIMD_X86_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vminc_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MinC);
+    }
+  }
+
+  TEST(F32_VMINC__WASMSIMD_X86_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vminc_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MinC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VMINC__WASM_X1, batch_eq_1) {
     VBinOpCMicrokernelTester()
@@ -671,6 +755,48 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VMINC__WASM_X8, batch_eq_8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vminc_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MinC);
+  }
+
+  TEST(F32_VMINC__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vminc_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MinC);
+    }
+  }
+
+  TEST(F32_VMINC__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vminc_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MinC);
+    }
+  }
+
+  TEST(F32_VMINC__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vminc_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MinC);
+    }
+  }
+
+  TEST(F32_VMINC__WASM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vminc_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MinC);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VMINC__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -773,3 +899,43 @@
       .Test(xnn_f32_vminc_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::MinC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VMINC__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vminc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MinC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VMINC__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vminc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MinC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMINC__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vminc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MinC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMINC__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vminc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MinC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMINC__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vminc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MinC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vminc.yaml b/test/f32-vminc.yaml
index 3e5c065..bcd7e0f 100644
--- a/test/f32-vminc.yaml
+++ b/test/f32-vminc.yaml
@@ -12,11 +12,15 @@
 - name: xnn_f32_vminc_ukernel__avx512f_x32
 - name: xnn_f32_vminc_ukernel__wasmsimd_arm_x4
 - name: xnn_f32_vminc_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_vminc_ukernel__wasmsimd_arm_x16
 - name: xnn_f32_vminc_ukernel__wasmsimd_x86_x4
 - name: xnn_f32_vminc_ukernel__wasmsimd_x86_x8
+- name: xnn_f32_vminc_ukernel__wasmsimd_x86_x16
 - name: xnn_f32_vminc_ukernel__wasm_x1
 - name: xnn_f32_vminc_ukernel__wasm_x2
 - name: xnn_f32_vminc_ukernel__wasm_x4
+- name: xnn_f32_vminc_ukernel__wasm_x8
 - name: xnn_f32_vminc_ukernel__scalar_x1
 - name: xnn_f32_vminc_ukernel__scalar_x2
 - name: xnn_f32_vminc_ukernel__scalar_x4
+- name: xnn_f32_vminc_ukernel__scalar_x8
diff --git a/test/f32-vmul-minmax.cc b/test/f32-vmul-minmax.cc
index 45ea7eb..da7a489 100644
--- a/test/f32-vmul-minmax.cc
+++ b/test/f32-vmul-minmax.cc
@@ -880,6 +880,85 @@
 
 
 #if XNN_ARCH_WASMSIMD
+  TEST(F32_VMUL_MINMAX__WASMSIMD_ARM_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Mul);
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_ARM_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_ARM_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_ARM_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_ARM_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_ARM_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_ARM_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_ARM_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_ARM_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
   TEST(F32_VMUL_MINMAX__WASMSIMD_X86_X4, batch_eq_4) {
     VBinOpMicrokernelTester()
       .batch_size(4)
@@ -1037,6 +1116,85 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VMUL_MINMAX__WASMSIMD_X86_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Mul);
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_X86_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_X86_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_X86_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_X86_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_X86_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_X86_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_X86_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASMSIMD_X86_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VMUL_MINMAX__WASM_X1, batch_eq_1) {
     VBinOpMicrokernelTester()
@@ -1258,6 +1416,85 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VMUL_MINMAX__WASM_X8, batch_eq_8) {
+    VBinOpMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vmul_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+  }
+
+  TEST(F32_VMUL_MINMAX__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASM_X8, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASM_X8, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASM_X8, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASM_X8, qmin) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_MINMAX__WASM_X8, qmax) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vmul_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VMUL_MINMAX__SCALAR_X1, batch_eq_1) {
   VBinOpMicrokernelTester()
     .batch_size(1)
@@ -1468,4 +1705,80 @@
       .qmax(128)
       .Test(xnn_f32_vmul_minmax_ukernel__scalar_x4, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
   }
+}
+
+TEST(F32_VMUL_MINMAX__SCALAR_X8, batch_eq_8) {
+  VBinOpMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vmul_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VMUL_MINMAX__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmul_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL_MINMAX__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmul_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL_MINMAX__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmul_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL_MINMAX__SCALAR_X8, inplace_a) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .Test(xnn_f32_vmul_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL_MINMAX__SCALAR_X8, inplace_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_b(true)
+      .Test(xnn_f32_vmul_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL_MINMAX__SCALAR_X8, inplace_a_and_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .inplace_b(true)
+      .Test(xnn_f32_vmul_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL_MINMAX__SCALAR_X8, qmin) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .qmin(128)
+      .Test(xnn_f32_vmul_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL_MINMAX__SCALAR_X8, qmax) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .qmax(128)
+      .Test(xnn_f32_vmul_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
 }
\ No newline at end of file
diff --git a/test/f32-vmul-minmax.yaml b/test/f32-vmul-minmax.yaml
index e424091..db3963b 100644
--- a/test/f32-vmul-minmax.yaml
+++ b/test/f32-vmul-minmax.yaml
@@ -12,11 +12,15 @@
 - name: xnn_f32_vmul_minmax_ukernel__avx512f_x32
 - name: xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x4
 - name: xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_vmul_minmax_ukernel__wasmsimd_arm_x16
 - name: xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x4
 - name: xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x8
+- name: xnn_f32_vmul_minmax_ukernel__wasmsimd_x86_x16
 - name: xnn_f32_vmul_minmax_ukernel__wasm_x1
 - name: xnn_f32_vmul_minmax_ukernel__wasm_x2
 - name: xnn_f32_vmul_minmax_ukernel__wasm_x4
+- name: xnn_f32_vmul_minmax_ukernel__wasm_x8
 - name: xnn_f32_vmul_minmax_ukernel__scalar_x1
 - name: xnn_f32_vmul_minmax_ukernel__scalar_x2
 - name: xnn_f32_vmul_minmax_ukernel__scalar_x4
+- name: xnn_f32_vmul_minmax_ukernel__scalar_x8
diff --git a/test/f32-vmul-relu.cc b/test/f32-vmul-relu.cc
index 414bfdf..de4b96e 100644
--- a/test/f32-vmul-relu.cc
+++ b/test/f32-vmul-relu.cc
@@ -139,6 +139,67 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VMUL_RELU__WASMSIMD_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vmul_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Mul);
+  }
+
+  TEST(F32_VMUL_RELU__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_RELU__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_RELU__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_RELU__WASMSIMD_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vmul_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_RELU__WASMSIMD_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vmul_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_RELU__WASMSIMD_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vmul_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VMUL_RELU__WASM_X1, batch_eq_1) {
     VBinOpMicrokernelTester()
@@ -306,6 +367,67 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VMUL_RELU__WASM_X8, batch_eq_8) {
+    VBinOpMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vmul_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+  }
+
+  TEST(F32_VMUL_RELU__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_RELU__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_RELU__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_RELU__WASM_X8, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vmul_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_RELU__WASM_X8, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vmul_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL_RELU__WASM_X8, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vmul_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VMUL_RELU__SCALAR_X1, batch_eq_1) {
   VBinOpMicrokernelTester()
     .batch_size(1)
@@ -465,3 +587,62 @@
       .Test(xnn_f32_vmul_relu_ukernel__scalar_x4, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VMUL_RELU__SCALAR_X8, batch_eq_8) {
+  VBinOpMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vmul_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VMUL_RELU__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmul_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL_RELU__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmul_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL_RELU__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmul_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL_RELU__SCALAR_X8, inplace_a) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .Test(xnn_f32_vmul_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL_RELU__SCALAR_X8, inplace_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_b(true)
+      .Test(xnn_f32_vmul_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL_RELU__SCALAR_X8, inplace_a_and_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .inplace_b(true)
+      .Test(xnn_f32_vmul_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vmul-relu.yaml b/test/f32-vmul-relu.yaml
index 11bbddb..42042dd 100644
--- a/test/f32-vmul-relu.yaml
+++ b/test/f32-vmul-relu.yaml
@@ -4,9 +4,12 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vmul_relu_ukernel__wasmsimd_x4
 - name: xnn_f32_vmul_relu_ukernel__wasmsimd_x8
+- name: xnn_f32_vmul_relu_ukernel__wasmsimd_x16
 - name: xnn_f32_vmul_relu_ukernel__wasm_x1
 - name: xnn_f32_vmul_relu_ukernel__wasm_x2
 - name: xnn_f32_vmul_relu_ukernel__wasm_x4
+- name: xnn_f32_vmul_relu_ukernel__wasm_x8
 - name: xnn_f32_vmul_relu_ukernel__scalar_x1
 - name: xnn_f32_vmul_relu_ukernel__scalar_x2
 - name: xnn_f32_vmul_relu_ukernel__scalar_x4
+- name: xnn_f32_vmul_relu_ukernel__scalar_x8
diff --git a/test/f32-vmul.cc b/test/f32-vmul.cc
index 99fdc10..d7e07e6 100644
--- a/test/f32-vmul.cc
+++ b/test/f32-vmul.cc
@@ -139,6 +139,67 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VMUL__WASMSIMD_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vmul_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Mul);
+  }
+
+  TEST(F32_VMUL__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmul_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL__WASMSIMD_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vmul_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL__WASMSIMD_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vmul_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+
+  TEST(F32_VMUL__WASMSIMD_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vmul_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Mul);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VMUL__SCALAR_X1, batch_eq_1) {
   VBinOpMicrokernelTester()
     .batch_size(1)
@@ -298,3 +359,62 @@
       .Test(xnn_f32_vmul_ukernel__scalar_x4, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VMUL__SCALAR_X8, batch_eq_8) {
+  VBinOpMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vmul_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VMUL__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmul_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmul_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmul_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL__SCALAR_X8, inplace_a) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .Test(xnn_f32_vmul_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL__SCALAR_X8, inplace_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_b(true)
+      .Test(xnn_f32_vmul_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMUL__SCALAR_X8, inplace_a_and_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .inplace_b(true)
+      .Test(xnn_f32_vmul_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Mul, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vmul.yaml b/test/f32-vmul.yaml
index e6ca9b1..029ff21 100644
--- a/test/f32-vmul.yaml
+++ b/test/f32-vmul.yaml
@@ -4,6 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vmul_ukernel__wasmsimd_x4
 - name: xnn_f32_vmul_ukernel__wasmsimd_x8
+- name: xnn_f32_vmul_ukernel__wasmsimd_x16
 - name: xnn_f32_vmul_ukernel__scalar_x1
 - name: xnn_f32_vmul_ukernel__scalar_x2
 - name: xnn_f32_vmul_ukernel__scalar_x4
+- name: xnn_f32_vmul_ukernel__scalar_x8
diff --git a/test/f32-vmulc-minmax.cc b/test/f32-vmulc-minmax.cc
index debdbad..b13b550 100644
--- a/test/f32-vmulc-minmax.cc
+++ b/test/f32-vmulc-minmax.cc
@@ -674,6 +674,66 @@
 
 
 #if XNN_ARCH_WASMSIMD
+  TEST(F32_VMULC_MINMAX__WASMSIMD_ARM_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MulC);
+  }
+
+  TEST(F32_VMULC_MINMAX__WASMSIMD_ARM_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASMSIMD_ARM_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASMSIMD_ARM_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASMSIMD_ARM_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASMSIMD_ARM_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASMSIMD_ARM_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
   TEST(F32_VMULC_MINMAX__WASMSIMD_X86_X4, batch_eq_4) {
     VBinOpCMicrokernelTester()
       .batch_size(4)
@@ -793,6 +853,66 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VMULC_MINMAX__WASMSIMD_X86_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MulC);
+  }
+
+  TEST(F32_VMULC_MINMAX__WASMSIMD_X86_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASMSIMD_X86_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASMSIMD_X86_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASMSIMD_X86_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASMSIMD_X86_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASMSIMD_X86_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VMULC_MINMAX__WASM_X1, batch_eq_1) {
     VBinOpCMicrokernelTester()
@@ -957,6 +1077,66 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VMULC_MINMAX__WASM_X8, batch_eq_8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vmulc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MulC);
+  }
+
+  TEST(F32_VMULC_MINMAX__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASM_X8, qmin) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_MINMAX__WASM_X8, qmax) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vmulc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VMULC_MINMAX__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -1110,4 +1290,61 @@
       .qmax(128)
       .Test(xnn_f32_vmulc_minmax_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
+}
+
+TEST(F32_VMULC_MINMAX__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vmulc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VMULC_MINMAX__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmulc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMULC_MINMAX__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmulc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMULC_MINMAX__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmulc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMULC_MINMAX__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vmulc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMULC_MINMAX__SCALAR_X8, qmin) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .qmin(128)
+      .Test(xnn_f32_vmulc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMULC_MINMAX__SCALAR_X8, qmax) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .qmax(128)
+      .Test(xnn_f32_vmulc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
 }
\ No newline at end of file
diff --git a/test/f32-vmulc-minmax.yaml b/test/f32-vmulc-minmax.yaml
index 81a8410..993f1e6 100644
--- a/test/f32-vmulc-minmax.yaml
+++ b/test/f32-vmulc-minmax.yaml
@@ -12,11 +12,15 @@
 - name: xnn_f32_vmulc_minmax_ukernel__avx512f_x32
 - name: xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x4
 - name: xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_vmulc_minmax_ukernel__wasmsimd_arm_x16
 - name: xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x4
 - name: xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x8
+- name: xnn_f32_vmulc_minmax_ukernel__wasmsimd_x86_x16
 - name: xnn_f32_vmulc_minmax_ukernel__wasm_x1
 - name: xnn_f32_vmulc_minmax_ukernel__wasm_x2
 - name: xnn_f32_vmulc_minmax_ukernel__wasm_x4
+- name: xnn_f32_vmulc_minmax_ukernel__wasm_x8
 - name: xnn_f32_vmulc_minmax_ukernel__scalar_x1
 - name: xnn_f32_vmulc_minmax_ukernel__scalar_x2
 - name: xnn_f32_vmulc_minmax_ukernel__scalar_x4
+- name: xnn_f32_vmulc_minmax_ukernel__scalar_x8
diff --git a/test/f32-vmulc-relu.cc b/test/f32-vmulc-relu.cc
index cf98b8e..dcaeae7 100644
--- a/test/f32-vmulc-relu.cc
+++ b/test/f32-vmulc-relu.cc
@@ -101,6 +101,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VMULC_RELU__WASMSIMD_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vmulc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::MulC);
+  }
+
+  TEST(F32_VMULC_RELU__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_RELU__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_RELU__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_RELU__WASMSIMD_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vmulc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VMULC_RELU__WASM_X1, batch_eq_1) {
     VBinOpCMicrokernelTester()
@@ -211,6 +253,48 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VMULC_RELU__WASM_X8, batch_eq_8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vmulc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MulC);
+  }
+
+  TEST(F32_VMULC_RELU__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_RELU__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_RELU__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC_RELU__WASM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vmulc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VMULC_RELU__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -313,3 +397,43 @@
       .Test(xnn_f32_vmulc_relu_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VMULC_RELU__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vmulc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VMULC_RELU__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmulc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMULC_RELU__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmulc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMULC_RELU__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmulc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMULC_RELU__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vmulc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vmulc-relu.yaml b/test/f32-vmulc-relu.yaml
index fa19514..948f2d8 100644
--- a/test/f32-vmulc-relu.yaml
+++ b/test/f32-vmulc-relu.yaml
@@ -4,9 +4,12 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vmulc_relu_ukernel__wasmsimd_x4
 - name: xnn_f32_vmulc_relu_ukernel__wasmsimd_x8
+- name: xnn_f32_vmulc_relu_ukernel__wasmsimd_x16
 - name: xnn_f32_vmulc_relu_ukernel__wasm_x1
 - name: xnn_f32_vmulc_relu_ukernel__wasm_x2
 - name: xnn_f32_vmulc_relu_ukernel__wasm_x4
+- name: xnn_f32_vmulc_relu_ukernel__wasm_x8
 - name: xnn_f32_vmulc_relu_ukernel__scalar_x1
 - name: xnn_f32_vmulc_relu_ukernel__scalar_x2
 - name: xnn_f32_vmulc_relu_ukernel__scalar_x4
+- name: xnn_f32_vmulc_relu_ukernel__scalar_x8
diff --git a/test/f32-vmulc.cc b/test/f32-vmulc.cc
index 8dbee07..fe9488b 100644
--- a/test/f32-vmulc.cc
+++ b/test/f32-vmulc.cc
@@ -101,6 +101,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VMULC__WASMSIMD_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vmulc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::MulC);
+  }
+
+  TEST(F32_VMULC__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vmulc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+
+  TEST(F32_VMULC__WASMSIMD_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vmulc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::MulC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VMULC__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -203,3 +245,43 @@
       .Test(xnn_f32_vmulc_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VMULC__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vmulc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VMULC__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmulc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMULC__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmulc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMULC__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vmulc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VMULC__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vmulc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::MulC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vmulc.yaml b/test/f32-vmulc.yaml
index ecd9fde..b39f874 100644
--- a/test/f32-vmulc.yaml
+++ b/test/f32-vmulc.yaml
@@ -4,6 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vmulc_ukernel__wasmsimd_x4
 - name: xnn_f32_vmulc_ukernel__wasmsimd_x8
+- name: xnn_f32_vmulc_ukernel__wasmsimd_x16
 - name: xnn_f32_vmulc_ukernel__scalar_x1
 - name: xnn_f32_vmulc_ukernel__scalar_x2
 - name: xnn_f32_vmulc_ukernel__scalar_x4
+- name: xnn_f32_vmulc_ukernel__scalar_x8
diff --git a/test/f32-vrdivc-minmax.cc b/test/f32-vrdivc-minmax.cc
index ab4d420..3964824 100644
--- a/test/f32-vrdivc-minmax.cc
+++ b/test/f32-vrdivc-minmax.cc
@@ -674,6 +674,66 @@
 
 
 #if XNN_ARCH_WASMSIMD
+  TEST(F32_VRDIVC_MINMAX__WASMSIMD_ARM_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASMSIMD_ARM_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASMSIMD_ARM_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASMSIMD_ARM_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASMSIMD_ARM_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASMSIMD_ARM_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASMSIMD_ARM_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
   TEST(F32_VRDIVC_MINMAX__WASMSIMD_X86_X4, batch_eq_4) {
     VBinOpCMicrokernelTester()
       .batch_size(4)
@@ -793,6 +853,66 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VRDIVC_MINMAX__WASMSIMD_X86_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASMSIMD_X86_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASMSIMD_X86_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASMSIMD_X86_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASMSIMD_X86_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASMSIMD_X86_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASMSIMD_X86_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VRDIVC_MINMAX__WASM_X1, batch_eq_1) {
     VBinOpCMicrokernelTester()
@@ -957,6 +1077,66 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VRDIVC_MINMAX__WASM_X8, batch_eq_8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vrdivc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RDivC);
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASM_X8, qmin) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_MINMAX__WASM_X8, qmax) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vrdivc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VRDIVC_MINMAX__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -1110,4 +1290,61 @@
       .qmax(128)
       .Test(xnn_f32_vrdivc_minmax_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
+}
+
+TEST(F32_VRDIVC_MINMAX__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vrdivc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VRDIVC_MINMAX__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrdivc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRDIVC_MINMAX__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrdivc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRDIVC_MINMAX__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrdivc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRDIVC_MINMAX__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vrdivc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRDIVC_MINMAX__SCALAR_X8, qmin) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .qmin(128)
+      .Test(xnn_f32_vrdivc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRDIVC_MINMAX__SCALAR_X8, qmax) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .qmax(128)
+      .Test(xnn_f32_vrdivc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
 }
\ No newline at end of file
diff --git a/test/f32-vrdivc-minmax.yaml b/test/f32-vrdivc-minmax.yaml
index 649ced6..50635bb 100644
--- a/test/f32-vrdivc-minmax.yaml
+++ b/test/f32-vrdivc-minmax.yaml
@@ -16,11 +16,15 @@
 - name: xnn_f32_vrdivc_minmax_ukernel__avx512f_x32
 - name: xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x4
 - name: xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_vrdivc_minmax_ukernel__wasmsimd_arm_x16
 - name: xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x4
 - name: xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x8
+- name: xnn_f32_vrdivc_minmax_ukernel__wasmsimd_x86_x16
 - name: xnn_f32_vrdivc_minmax_ukernel__wasm_x1
 - name: xnn_f32_vrdivc_minmax_ukernel__wasm_x2
 - name: xnn_f32_vrdivc_minmax_ukernel__wasm_x4
+- name: xnn_f32_vrdivc_minmax_ukernel__wasm_x8
 - name: xnn_f32_vrdivc_minmax_ukernel__scalar_x1
 - name: xnn_f32_vrdivc_minmax_ukernel__scalar_x2
 - name: xnn_f32_vrdivc_minmax_ukernel__scalar_x4
+- name: xnn_f32_vrdivc_minmax_ukernel__scalar_x8
diff --git a/test/f32-vrdivc-relu.cc b/test/f32-vrdivc-relu.cc
index 0eb043d..a2efc96 100644
--- a/test/f32-vrdivc-relu.cc
+++ b/test/f32-vrdivc-relu.cc
@@ -101,6 +101,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VRDIVC_RELU__WASMSIMD_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vrdivc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+  }
+
+  TEST(F32_VRDIVC_RELU__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_RELU__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_RELU__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_RELU__WASMSIMD_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vrdivc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VRDIVC_RELU__WASM_X1, batch_eq_1) {
     VBinOpCMicrokernelTester()
@@ -211,6 +253,48 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VRDIVC_RELU__WASM_X8, batch_eq_8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vrdivc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RDivC);
+  }
+
+  TEST(F32_VRDIVC_RELU__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_RELU__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_RELU__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC_RELU__WASM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vrdivc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VRDIVC_RELU__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -313,3 +397,43 @@
       .Test(xnn_f32_vrdivc_relu_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VRDIVC_RELU__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vrdivc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VRDIVC_RELU__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrdivc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRDIVC_RELU__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrdivc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRDIVC_RELU__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrdivc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRDIVC_RELU__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vrdivc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vrdivc-relu.yaml b/test/f32-vrdivc-relu.yaml
index b68e96b..eba90a6 100644
--- a/test/f32-vrdivc-relu.yaml
+++ b/test/f32-vrdivc-relu.yaml
@@ -4,9 +4,12 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vrdivc_relu_ukernel__wasmsimd_x4
 - name: xnn_f32_vrdivc_relu_ukernel__wasmsimd_x8
+- name: xnn_f32_vrdivc_relu_ukernel__wasmsimd_x16
 - name: xnn_f32_vrdivc_relu_ukernel__wasm_x1
 - name: xnn_f32_vrdivc_relu_ukernel__wasm_x2
 - name: xnn_f32_vrdivc_relu_ukernel__wasm_x4
+- name: xnn_f32_vrdivc_relu_ukernel__wasm_x8
 - name: xnn_f32_vrdivc_relu_ukernel__scalar_x1
 - name: xnn_f32_vrdivc_relu_ukernel__scalar_x2
 - name: xnn_f32_vrdivc_relu_ukernel__scalar_x4
+- name: xnn_f32_vrdivc_relu_ukernel__scalar_x8
diff --git a/test/f32-vrdivc.cc b/test/f32-vrdivc.cc
index cb99b1b..308d3d9 100644
--- a/test/f32-vrdivc.cc
+++ b/test/f32-vrdivc.cc
@@ -101,6 +101,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VRDIVC__WASMSIMD_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vrdivc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+  }
+
+  TEST(F32_VRDIVC__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrdivc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+
+  TEST(F32_VRDIVC__WASMSIMD_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vrdivc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RDivC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VRDIVC__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -203,3 +245,43 @@
       .Test(xnn_f32_vrdivc_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VRDIVC__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vrdivc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VRDIVC__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrdivc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRDIVC__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrdivc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRDIVC__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrdivc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRDIVC__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vrdivc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RDivC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vrdivc.yaml b/test/f32-vrdivc.yaml
index 1b7e751..f82644a 100644
--- a/test/f32-vrdivc.yaml
+++ b/test/f32-vrdivc.yaml
@@ -4,6 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vrdivc_ukernel__wasmsimd_x4
 - name: xnn_f32_vrdivc_ukernel__wasmsimd_x8
+- name: xnn_f32_vrdivc_ukernel__wasmsimd_x16
 - name: xnn_f32_vrdivc_ukernel__scalar_x1
 - name: xnn_f32_vrdivc_ukernel__scalar_x2
 - name: xnn_f32_vrdivc_ukernel__scalar_x4
+- name: xnn_f32_vrdivc_ukernel__scalar_x8
diff --git a/test/f32-vrsubc-minmax.cc b/test/f32-vrsubc-minmax.cc
index e6fe5af..000458e 100644
--- a/test/f32-vrsubc-minmax.cc
+++ b/test/f32-vrsubc-minmax.cc
@@ -674,6 +674,66 @@
 
 
 #if XNN_ARCH_WASMSIMD
+  TEST(F32_VRSUBC_MINMAX__WASMSIMD_ARM_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASMSIMD_ARM_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASMSIMD_ARM_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASMSIMD_ARM_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASMSIMD_ARM_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASMSIMD_ARM_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASMSIMD_ARM_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
   TEST(F32_VRSUBC_MINMAX__WASMSIMD_X86_X4, batch_eq_4) {
     VBinOpCMicrokernelTester()
       .batch_size(4)
@@ -793,6 +853,66 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VRSUBC_MINMAX__WASMSIMD_X86_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASMSIMD_X86_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASMSIMD_X86_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASMSIMD_X86_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASMSIMD_X86_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASMSIMD_X86_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASMSIMD_X86_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VRSUBC_MINMAX__WASM_X1, batch_eq_1) {
     VBinOpCMicrokernelTester()
@@ -957,6 +1077,66 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VRSUBC_MINMAX__WASM_X8, batch_eq_8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vrsubc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RSubC);
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASM_X8, qmin) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_MINMAX__WASM_X8, qmax) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vrsubc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VRSUBC_MINMAX__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -1110,4 +1290,61 @@
       .qmax(128)
       .Test(xnn_f32_vrsubc_minmax_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
+}
+
+TEST(F32_VRSUBC_MINMAX__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vrsubc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VRSUBC_MINMAX__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrsubc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRSUBC_MINMAX__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrsubc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRSUBC_MINMAX__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrsubc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRSUBC_MINMAX__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vrsubc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRSUBC_MINMAX__SCALAR_X8, qmin) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .qmin(128)
+      .Test(xnn_f32_vrsubc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRSUBC_MINMAX__SCALAR_X8, qmax) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .qmax(128)
+      .Test(xnn_f32_vrsubc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
 }
\ No newline at end of file
diff --git a/test/f32-vrsubc-minmax.yaml b/test/f32-vrsubc-minmax.yaml
index 32a4c25..2f2988d 100644
--- a/test/f32-vrsubc-minmax.yaml
+++ b/test/f32-vrsubc-minmax.yaml
@@ -12,11 +12,15 @@
 - name: xnn_f32_vrsubc_minmax_ukernel__avx512f_x32
 - name: xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x4
 - name: xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_vrsubc_minmax_ukernel__wasmsimd_arm_x16
 - name: xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x4
 - name: xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x8
+- name: xnn_f32_vrsubc_minmax_ukernel__wasmsimd_x86_x16
 - name: xnn_f32_vrsubc_minmax_ukernel__wasm_x1
 - name: xnn_f32_vrsubc_minmax_ukernel__wasm_x2
 - name: xnn_f32_vrsubc_minmax_ukernel__wasm_x4
+- name: xnn_f32_vrsubc_minmax_ukernel__wasm_x8
 - name: xnn_f32_vrsubc_minmax_ukernel__scalar_x1
 - name: xnn_f32_vrsubc_minmax_ukernel__scalar_x2
 - name: xnn_f32_vrsubc_minmax_ukernel__scalar_x4
+- name: xnn_f32_vrsubc_minmax_ukernel__scalar_x8
diff --git a/test/f32-vrsubc-relu.cc b/test/f32-vrsubc-relu.cc
index f549f84..ff76b5d 100644
--- a/test/f32-vrsubc-relu.cc
+++ b/test/f32-vrsubc-relu.cc
@@ -101,6 +101,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VRSUBC_RELU__WASMSIMD_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vrsubc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+  }
+
+  TEST(F32_VRSUBC_RELU__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_RELU__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_RELU__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_RELU__WASMSIMD_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vrsubc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VRSUBC_RELU__WASM_X1, batch_eq_1) {
     VBinOpCMicrokernelTester()
@@ -211,6 +253,48 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VRSUBC_RELU__WASM_X8, batch_eq_8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vrsubc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RSubC);
+  }
+
+  TEST(F32_VRSUBC_RELU__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_RELU__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_RELU__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC_RELU__WASM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vrsubc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VRSUBC_RELU__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -313,3 +397,43 @@
       .Test(xnn_f32_vrsubc_relu_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VRSUBC_RELU__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vrsubc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VRSUBC_RELU__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrsubc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRSUBC_RELU__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrsubc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRSUBC_RELU__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrsubc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRSUBC_RELU__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vrsubc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vrsubc-relu.yaml b/test/f32-vrsubc-relu.yaml
index 0a7a3b6..e037af2 100644
--- a/test/f32-vrsubc-relu.yaml
+++ b/test/f32-vrsubc-relu.yaml
@@ -4,9 +4,12 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vrsubc_relu_ukernel__wasmsimd_x4
 - name: xnn_f32_vrsubc_relu_ukernel__wasmsimd_x8
+- name: xnn_f32_vrsubc_relu_ukernel__wasmsimd_x16
 - name: xnn_f32_vrsubc_relu_ukernel__wasm_x1
 - name: xnn_f32_vrsubc_relu_ukernel__wasm_x2
 - name: xnn_f32_vrsubc_relu_ukernel__wasm_x4
+- name: xnn_f32_vrsubc_relu_ukernel__wasm_x8
 - name: xnn_f32_vrsubc_relu_ukernel__scalar_x1
 - name: xnn_f32_vrsubc_relu_ukernel__scalar_x2
 - name: xnn_f32_vrsubc_relu_ukernel__scalar_x4
+- name: xnn_f32_vrsubc_relu_ukernel__scalar_x8
diff --git a/test/f32-vrsubc.cc b/test/f32-vrsubc.cc
index cdbcffe..33540f5 100644
--- a/test/f32-vrsubc.cc
+++ b/test/f32-vrsubc.cc
@@ -101,6 +101,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VRSUBC__WASMSIMD_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vrsubc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+  }
+
+  TEST(F32_VRSUBC__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vrsubc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+
+  TEST(F32_VRSUBC__WASMSIMD_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vrsubc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::RSubC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VRSUBC__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -203,3 +245,43 @@
       .Test(xnn_f32_vrsubc_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VRSUBC__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vrsubc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VRSUBC__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrsubc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRSUBC__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrsubc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRSUBC__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vrsubc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VRSUBC__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vrsubc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::RSubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vrsubc.yaml b/test/f32-vrsubc.yaml
index f324e6b..2fdbed3 100644
--- a/test/f32-vrsubc.yaml
+++ b/test/f32-vrsubc.yaml
@@ -4,6 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vrsubc_ukernel__wasmsimd_x4
 - name: xnn_f32_vrsubc_ukernel__wasmsimd_x8
+- name: xnn_f32_vrsubc_ukernel__wasmsimd_x16
 - name: xnn_f32_vrsubc_ukernel__scalar_x1
 - name: xnn_f32_vrsubc_ukernel__scalar_x2
 - name: xnn_f32_vrsubc_ukernel__scalar_x4
+- name: xnn_f32_vrsubc_ukernel__scalar_x8
diff --git a/test/f32-vsqrdiff.cc b/test/f32-vsqrdiff.cc
index b0a049d..a1909e7 100644
--- a/test/f32-vsqrdiff.cc
+++ b/test/f32-vsqrdiff.cc
@@ -683,6 +683,67 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VSQRDIFF__WASMSIMD_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vsqrdiff_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::SqrDiff);
+  }
+
+  TEST(F32_VSQRDIFF__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsqrdiff_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::SqrDiff);
+    }
+  }
+
+  TEST(F32_VSQRDIFF__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsqrdiff_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::SqrDiff);
+    }
+  }
+
+  TEST(F32_VSQRDIFF__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsqrdiff_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::SqrDiff);
+    }
+  }
+
+  TEST(F32_VSQRDIFF__WASMSIMD_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vsqrdiff_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::SqrDiff);
+    }
+  }
+
+  TEST(F32_VSQRDIFF__WASMSIMD_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vsqrdiff_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::SqrDiff);
+    }
+  }
+
+  TEST(F32_VSQRDIFF__WASMSIMD_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vsqrdiff_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::SqrDiff);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VSQRDIFF__SCALAR_X1, batch_eq_1) {
   VBinOpMicrokernelTester()
     .batch_size(1)
@@ -842,3 +903,62 @@
       .Test(xnn_f32_vsqrdiff_ukernel__scalar_x4, VBinOpMicrokernelTester::OpType::SqrDiff, VBinOpMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VSQRDIFF__SCALAR_X8, batch_eq_8) {
+  VBinOpMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vsqrdiff_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::SqrDiff, VBinOpMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VSQRDIFF__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsqrdiff_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::SqrDiff, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSQRDIFF__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsqrdiff_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::SqrDiff, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSQRDIFF__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsqrdiff_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::SqrDiff, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSQRDIFF__SCALAR_X8, inplace_a) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .Test(xnn_f32_vsqrdiff_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::SqrDiff, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSQRDIFF__SCALAR_X8, inplace_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_b(true)
+      .Test(xnn_f32_vsqrdiff_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::SqrDiff, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSQRDIFF__SCALAR_X8, inplace_a_and_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .inplace_b(true)
+      .Test(xnn_f32_vsqrdiff_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::SqrDiff, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vsqrdiff.yaml b/test/f32-vsqrdiff.yaml
index 55729e9..1f006cd 100644
--- a/test/f32-vsqrdiff.yaml
+++ b/test/f32-vsqrdiff.yaml
@@ -12,6 +12,8 @@
 - name: xnn_f32_vsqrdiff_ukernel__avx512f_x32
 - name: xnn_f32_vsqrdiff_ukernel__wasmsimd_x4
 - name: xnn_f32_vsqrdiff_ukernel__wasmsimd_x8
+- name: xnn_f32_vsqrdiff_ukernel__wasmsimd_x16
 - name: xnn_f32_vsqrdiff_ukernel__scalar_x1
 - name: xnn_f32_vsqrdiff_ukernel__scalar_x2
 - name: xnn_f32_vsqrdiff_ukernel__scalar_x4
+- name: xnn_f32_vsqrdiff_ukernel__scalar_x8
diff --git a/test/f32-vsqrdiffc.cc b/test/f32-vsqrdiffc.cc
index 0aea0b6..39137c3 100644
--- a/test/f32-vsqrdiffc.cc
+++ b/test/f32-vsqrdiffc.cc
@@ -477,6 +477,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VSQRDIFFC__WASMSIMD_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SqrDiffC);
+  }
+
+  TEST(F32_VSQRDIFFC__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SqrDiffC);
+    }
+  }
+
+  TEST(F32_VSQRDIFFC__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SqrDiffC);
+    }
+  }
+
+  TEST(F32_VSQRDIFFC__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SqrDiffC);
+    }
+  }
+
+  TEST(F32_VSQRDIFFC__WASMSIMD_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SqrDiffC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VSQRDIFFC__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -579,3 +621,43 @@
       .Test(xnn_f32_vsqrdiffc_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::SqrDiffC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VSQRDIFFC__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vsqrdiffc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SqrDiffC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VSQRDIFFC__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsqrdiffc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SqrDiffC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSQRDIFFC__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsqrdiffc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SqrDiffC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSQRDIFFC__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsqrdiffc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SqrDiffC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSQRDIFFC__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vsqrdiffc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SqrDiffC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vsqrdiffc.yaml b/test/f32-vsqrdiffc.yaml
index 633d188..7ca7351 100644
--- a/test/f32-vsqrdiffc.yaml
+++ b/test/f32-vsqrdiffc.yaml
@@ -12,6 +12,8 @@
 - name: xnn_f32_vsqrdiffc_ukernel__avx512f_x32
 - name: xnn_f32_vsqrdiffc_ukernel__wasmsimd_x4
 - name: xnn_f32_vsqrdiffc_ukernel__wasmsimd_x8
+- name: xnn_f32_vsqrdiffc_ukernel__wasmsimd_x16
 - name: xnn_f32_vsqrdiffc_ukernel__scalar_x1
 - name: xnn_f32_vsqrdiffc_ukernel__scalar_x2
 - name: xnn_f32_vsqrdiffc_ukernel__scalar_x4
+- name: xnn_f32_vsqrdiffc_ukernel__scalar_x8
diff --git a/test/f32-vsub-minmax.cc b/test/f32-vsub-minmax.cc
index 6728bb9..f41e64f 100644
--- a/test/f32-vsub-minmax.cc
+++ b/test/f32-vsub-minmax.cc
@@ -880,6 +880,85 @@
 
 
 #if XNN_ARCH_WASMSIMD
+  TEST(F32_VSUB_MINMAX__WASMSIMD_ARM_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Sub);
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_ARM_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_ARM_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_ARM_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_ARM_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_ARM_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_ARM_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_ARM_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_ARM_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
   TEST(F32_VSUB_MINMAX__WASMSIMD_X86_X4, batch_eq_4) {
     VBinOpMicrokernelTester()
       .batch_size(4)
@@ -1037,6 +1116,85 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VSUB_MINMAX__WASMSIMD_X86_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Sub);
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_X86_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_X86_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_X86_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_X86_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_X86_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_X86_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_X86_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASMSIMD_X86_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VSUB_MINMAX__WASM_X1, batch_eq_1) {
     VBinOpMicrokernelTester()
@@ -1258,6 +1416,85 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VSUB_MINMAX__WASM_X8, batch_eq_8) {
+    VBinOpMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vsub_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+  }
+
+  TEST(F32_VSUB_MINMAX__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASM_X8, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASM_X8, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASM_X8, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASM_X8, qmin) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_MINMAX__WASM_X8, qmax) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vsub_minmax_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VSUB_MINMAX__SCALAR_X1, batch_eq_1) {
   VBinOpMicrokernelTester()
     .batch_size(1)
@@ -1468,4 +1705,80 @@
       .qmax(128)
       .Test(xnn_f32_vsub_minmax_ukernel__scalar_x4, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
   }
+}
+
+TEST(F32_VSUB_MINMAX__SCALAR_X8, batch_eq_8) {
+  VBinOpMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vsub_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VSUB_MINMAX__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsub_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB_MINMAX__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsub_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB_MINMAX__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsub_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB_MINMAX__SCALAR_X8, inplace_a) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .Test(xnn_f32_vsub_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB_MINMAX__SCALAR_X8, inplace_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_b(true)
+      .Test(xnn_f32_vsub_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB_MINMAX__SCALAR_X8, inplace_a_and_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .inplace_b(true)
+      .Test(xnn_f32_vsub_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB_MINMAX__SCALAR_X8, qmin) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .qmin(128)
+      .Test(xnn_f32_vsub_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB_MINMAX__SCALAR_X8, qmax) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .qmax(128)
+      .Test(xnn_f32_vsub_minmax_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
 }
\ No newline at end of file
diff --git a/test/f32-vsub-minmax.yaml b/test/f32-vsub-minmax.yaml
index 1429efb..b23a952 100644
--- a/test/f32-vsub-minmax.yaml
+++ b/test/f32-vsub-minmax.yaml
@@ -12,11 +12,15 @@
 - name: xnn_f32_vsub_minmax_ukernel__avx512f_x32
 - name: xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x4
 - name: xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_vsub_minmax_ukernel__wasmsimd_arm_x16
 - name: xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x4
 - name: xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x8
+- name: xnn_f32_vsub_minmax_ukernel__wasmsimd_x86_x16
 - name: xnn_f32_vsub_minmax_ukernel__wasm_x1
 - name: xnn_f32_vsub_minmax_ukernel__wasm_x2
 - name: xnn_f32_vsub_minmax_ukernel__wasm_x4
+- name: xnn_f32_vsub_minmax_ukernel__wasm_x8
 - name: xnn_f32_vsub_minmax_ukernel__scalar_x1
 - name: xnn_f32_vsub_minmax_ukernel__scalar_x2
 - name: xnn_f32_vsub_minmax_ukernel__scalar_x4
+- name: xnn_f32_vsub_minmax_ukernel__scalar_x8
diff --git a/test/f32-vsub-relu.cc b/test/f32-vsub-relu.cc
index 9cd9400..1e59a8a 100644
--- a/test/f32-vsub-relu.cc
+++ b/test/f32-vsub-relu.cc
@@ -139,6 +139,67 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VSUB_RELU__WASMSIMD_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vsub_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Sub);
+  }
+
+  TEST(F32_VSUB_RELU__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_RELU__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_RELU__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_RELU__WASMSIMD_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vsub_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_RELU__WASMSIMD_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vsub_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_RELU__WASMSIMD_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vsub_relu_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VSUB_RELU__WASM_X1, batch_eq_1) {
     VBinOpMicrokernelTester()
@@ -306,6 +367,67 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VSUB_RELU__WASM_X8, batch_eq_8) {
+    VBinOpMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vsub_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+  }
+
+  TEST(F32_VSUB_RELU__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_RELU__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_RELU__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_RELU__WASM_X8, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vsub_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_RELU__WASM_X8, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vsub_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB_RELU__WASM_X8, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vsub_relu_ukernel__wasm_x8, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VSUB_RELU__SCALAR_X1, batch_eq_1) {
   VBinOpMicrokernelTester()
     .batch_size(1)
@@ -465,3 +587,62 @@
       .Test(xnn_f32_vsub_relu_ukernel__scalar_x4, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VSUB_RELU__SCALAR_X8, batch_eq_8) {
+  VBinOpMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vsub_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VSUB_RELU__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsub_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB_RELU__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsub_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB_RELU__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsub_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB_RELU__SCALAR_X8, inplace_a) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .Test(xnn_f32_vsub_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB_RELU__SCALAR_X8, inplace_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_b(true)
+      .Test(xnn_f32_vsub_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB_RELU__SCALAR_X8, inplace_a_and_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .inplace_b(true)
+      .Test(xnn_f32_vsub_relu_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vsub-relu.yaml b/test/f32-vsub-relu.yaml
index e1d90cf..ae6955e 100644
--- a/test/f32-vsub-relu.yaml
+++ b/test/f32-vsub-relu.yaml
@@ -4,9 +4,12 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vsub_relu_ukernel__wasmsimd_x4
 - name: xnn_f32_vsub_relu_ukernel__wasmsimd_x8
+- name: xnn_f32_vsub_relu_ukernel__wasmsimd_x16
 - name: xnn_f32_vsub_relu_ukernel__wasm_x1
 - name: xnn_f32_vsub_relu_ukernel__wasm_x2
 - name: xnn_f32_vsub_relu_ukernel__wasm_x4
+- name: xnn_f32_vsub_relu_ukernel__wasm_x8
 - name: xnn_f32_vsub_relu_ukernel__scalar_x1
 - name: xnn_f32_vsub_relu_ukernel__scalar_x2
 - name: xnn_f32_vsub_relu_ukernel__scalar_x4
+- name: xnn_f32_vsub_relu_ukernel__scalar_x8
diff --git a/test/f32-vsub.cc b/test/f32-vsub.cc
index ff4f4f5..d656943 100644
--- a/test/f32-vsub.cc
+++ b/test/f32-vsub.cc
@@ -139,6 +139,67 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VSUB__WASMSIMD_X16, batch_eq_16) {
+    VBinOpMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vsub_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Sub);
+  }
+
+  TEST(F32_VSUB__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsub_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB__WASMSIMD_X16, inplace_a) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .Test(xnn_f32_vsub_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB__WASMSIMD_X16, inplace_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_b(true)
+        .Test(xnn_f32_vsub_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+
+  TEST(F32_VSUB__WASMSIMD_X16, inplace_a_and_b) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace_a(true)
+        .inplace_b(true)
+        .Test(xnn_f32_vsub_ukernel__wasmsimd_x16, VBinOpMicrokernelTester::OpType::Sub);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VSUB__SCALAR_X1, batch_eq_1) {
   VBinOpMicrokernelTester()
     .batch_size(1)
@@ -298,3 +359,62 @@
       .Test(xnn_f32_vsub_ukernel__scalar_x4, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VSUB__SCALAR_X8, batch_eq_8) {
+  VBinOpMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vsub_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VSUB__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsub_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsub_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsub_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB__SCALAR_X8, inplace_a) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .Test(xnn_f32_vsub_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB__SCALAR_X8, inplace_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_b(true)
+      .Test(xnn_f32_vsub_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUB__SCALAR_X8, inplace_a_and_b) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace_a(true)
+      .inplace_b(true)
+      .Test(xnn_f32_vsub_ukernel__scalar_x8, VBinOpMicrokernelTester::OpType::Sub, VBinOpMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vsub.yaml b/test/f32-vsub.yaml
index d8be8a7..0555129 100644
--- a/test/f32-vsub.yaml
+++ b/test/f32-vsub.yaml
@@ -4,6 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vsub_ukernel__wasmsimd_x4
 - name: xnn_f32_vsub_ukernel__wasmsimd_x8
+- name: xnn_f32_vsub_ukernel__wasmsimd_x16
 - name: xnn_f32_vsub_ukernel__scalar_x1
 - name: xnn_f32_vsub_ukernel__scalar_x2
 - name: xnn_f32_vsub_ukernel__scalar_x4
+- name: xnn_f32_vsub_ukernel__scalar_x8
diff --git a/test/f32-vsubc-minmax.cc b/test/f32-vsubc-minmax.cc
index dde1c9f..9a5c789 100644
--- a/test/f32-vsubc-minmax.cc
+++ b/test/f32-vsubc-minmax.cc
@@ -674,6 +674,66 @@
 
 
 #if XNN_ARCH_WASMSIMD
+  TEST(F32_VSUBC_MINMAX__WASMSIMD_ARM_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::SubC);
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASMSIMD_ARM_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASMSIMD_ARM_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASMSIMD_ARM_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASMSIMD_ARM_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASMSIMD_ARM_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASMSIMD_ARM_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
+#if XNN_ARCH_WASMSIMD
   TEST(F32_VSUBC_MINMAX__WASMSIMD_X86_X4, batch_eq_4) {
     VBinOpCMicrokernelTester()
       .batch_size(4)
@@ -793,6 +853,66 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VSUBC_MINMAX__WASMSIMD_X86_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::SubC);
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASMSIMD_X86_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASMSIMD_X86_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASMSIMD_X86_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASMSIMD_X86_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASMSIMD_X86_X16, qmin) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASMSIMD_X86_X16, qmax) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VSUBC_MINMAX__WASM_X1, batch_eq_1) {
     VBinOpCMicrokernelTester()
@@ -957,6 +1077,66 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VSUBC_MINMAX__WASM_X8, batch_eq_8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vsubc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::SubC);
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASM_X8, qmin) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmin(128)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_MINMAX__WASM_X8, qmax) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .qmax(128)
+        .Test(xnn_f32_vsubc_minmax_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VSUBC_MINMAX__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -1110,4 +1290,61 @@
       .qmax(128)
       .Test(xnn_f32_vsubc_minmax_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
+}
+
+TEST(F32_VSUBC_MINMAX__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vsubc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VSUBC_MINMAX__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsubc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUBC_MINMAX__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsubc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUBC_MINMAX__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsubc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUBC_MINMAX__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vsubc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUBC_MINMAX__SCALAR_X8, qmin) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .qmin(128)
+      .Test(xnn_f32_vsubc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUBC_MINMAX__SCALAR_X8, qmax) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .qmax(128)
+      .Test(xnn_f32_vsubc_minmax_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
 }
\ No newline at end of file
diff --git a/test/f32-vsubc-minmax.yaml b/test/f32-vsubc-minmax.yaml
index 0d54c31..ccc204a 100644
--- a/test/f32-vsubc-minmax.yaml
+++ b/test/f32-vsubc-minmax.yaml
@@ -12,11 +12,15 @@
 - name: xnn_f32_vsubc_minmax_ukernel__avx512f_x32
 - name: xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x4
 - name: xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x8
+- name: xnn_f32_vsubc_minmax_ukernel__wasmsimd_arm_x16
 - name: xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x4
 - name: xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x8
+- name: xnn_f32_vsubc_minmax_ukernel__wasmsimd_x86_x16
 - name: xnn_f32_vsubc_minmax_ukernel__wasm_x1
 - name: xnn_f32_vsubc_minmax_ukernel__wasm_x2
 - name: xnn_f32_vsubc_minmax_ukernel__wasm_x4
+- name: xnn_f32_vsubc_minmax_ukernel__wasm_x8
 - name: xnn_f32_vsubc_minmax_ukernel__scalar_x1
 - name: xnn_f32_vsubc_minmax_ukernel__scalar_x2
 - name: xnn_f32_vsubc_minmax_ukernel__scalar_x4
+- name: xnn_f32_vsubc_minmax_ukernel__scalar_x8
diff --git a/test/f32-vsubc-relu.cc b/test/f32-vsubc-relu.cc
index e76adcd..3d51832 100644
--- a/test/f32-vsubc-relu.cc
+++ b/test/f32-vsubc-relu.cc
@@ -101,6 +101,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VSUBC_RELU__WASMSIMD_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vsubc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SubC);
+  }
+
+  TEST(F32_VSUBC_RELU__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_RELU__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_RELU__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_RELU__WASMSIMD_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vsubc_relu_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 #if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
   TEST(F32_VSUBC_RELU__WASM_X1, batch_eq_1) {
     VBinOpCMicrokernelTester()
@@ -211,6 +253,48 @@
 #endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+  TEST(F32_VSUBC_RELU__WASM_X8, batch_eq_8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(8)
+      .Test(xnn_f32_vsubc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::SubC);
+  }
+
+  TEST(F32_VSUBC_RELU__WASM_X8, batch_div_8) {
+    for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_RELU__WASM_X8, batch_lt_8) {
+    for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_RELU__WASM_X8, batch_gt_8) {
+    for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC_RELU__WASM_X8, inplace) {
+    for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vsubc_relu_ukernel__wasm_x8, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+#endif  // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VSUBC_RELU__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -313,3 +397,43 @@
       .Test(xnn_f32_vsubc_relu_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VSUBC_RELU__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vsubc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VSUBC_RELU__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsubc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUBC_RELU__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsubc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUBC_RELU__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsubc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUBC_RELU__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vsubc_relu_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vsubc-relu.yaml b/test/f32-vsubc-relu.yaml
index 862ecb4..39e5826 100644
--- a/test/f32-vsubc-relu.yaml
+++ b/test/f32-vsubc-relu.yaml
@@ -4,9 +4,12 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vsubc_relu_ukernel__wasmsimd_x4
 - name: xnn_f32_vsubc_relu_ukernel__wasmsimd_x8
+- name: xnn_f32_vsubc_relu_ukernel__wasmsimd_x16
 - name: xnn_f32_vsubc_relu_ukernel__wasm_x1
 - name: xnn_f32_vsubc_relu_ukernel__wasm_x2
 - name: xnn_f32_vsubc_relu_ukernel__wasm_x4
+- name: xnn_f32_vsubc_relu_ukernel__wasm_x8
 - name: xnn_f32_vsubc_relu_ukernel__scalar_x1
 - name: xnn_f32_vsubc_relu_ukernel__scalar_x2
 - name: xnn_f32_vsubc_relu_ukernel__scalar_x4
+- name: xnn_f32_vsubc_relu_ukernel__scalar_x8
diff --git a/test/f32-vsubc.cc b/test/f32-vsubc.cc
index e12b9a4..829d857 100644
--- a/test/f32-vsubc.cc
+++ b/test/f32-vsubc.cc
@@ -101,6 +101,48 @@
 #endif  // XNN_ARCH_WASMSIMD
 
 
+#if XNN_ARCH_WASMSIMD
+  TEST(F32_VSUBC__WASMSIMD_X16, batch_eq_16) {
+    VBinOpCMicrokernelTester()
+      .batch_size(16)
+      .Test(xnn_f32_vsubc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SubC);
+  }
+
+  TEST(F32_VSUBC__WASMSIMD_X16, batch_div_16) {
+    for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC__WASMSIMD_X16, batch_lt_16) {
+    for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC__WASMSIMD_X16, batch_gt_16) {
+    for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .Test(xnn_f32_vsubc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+
+  TEST(F32_VSUBC__WASMSIMD_X16, inplace) {
+    for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+      VBinOpCMicrokernelTester()
+        .batch_size(batch_size)
+        .inplace(true)
+        .Test(xnn_f32_vsubc_ukernel__wasmsimd_x16, VBinOpCMicrokernelTester::OpType::SubC);
+    }
+  }
+#endif  // XNN_ARCH_WASMSIMD
+
+
 TEST(F32_VSUBC__SCALAR_X1, batch_eq_1) {
   VBinOpCMicrokernelTester()
     .batch_size(1)
@@ -203,3 +245,43 @@
       .Test(xnn_f32_vsubc_ukernel__scalar_x4, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
   }
 }
+
+
+TEST(F32_VSUBC__SCALAR_X8, batch_eq_8) {
+  VBinOpCMicrokernelTester()
+    .batch_size(8)
+    .Test(xnn_f32_vsubc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+}
+
+TEST(F32_VSUBC__SCALAR_X8, batch_div_8) {
+  for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsubc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUBC__SCALAR_X8, batch_lt_8) {
+  for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsubc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUBC__SCALAR_X8, batch_gt_8) {
+  for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .Test(xnn_f32_vsubc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
+
+TEST(F32_VSUBC__SCALAR_X8, inplace) {
+  for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+    VBinOpCMicrokernelTester()
+      .batch_size(batch_size)
+      .inplace(true)
+      .Test(xnn_f32_vsubc_ukernel__scalar_x8, VBinOpCMicrokernelTester::OpType::SubC, VBinOpCMicrokernelTester::Variant::Scalar);
+  }
+}
diff --git a/test/f32-vsubc.yaml b/test/f32-vsubc.yaml
index 3cb381a..35b86c3 100644
--- a/test/f32-vsubc.yaml
+++ b/test/f32-vsubc.yaml
@@ -4,6 +4,8 @@
 # LICENSE file in the root directory of this source tree.
 - name: xnn_f32_vsubc_ukernel__wasmsimd_x4
 - name: xnn_f32_vsubc_ukernel__wasmsimd_x8
+- name: xnn_f32_vsubc_ukernel__wasmsimd_x16
 - name: xnn_f32_vsubc_ukernel__scalar_x1
 - name: xnn_f32_vsubc_ukernel__scalar_x2
 - name: xnn_f32_vsubc_ukernel__scalar_x4
+- name: xnn_f32_vsubc_ukernel__scalar_x8