Separate WAsm micro-kernels and scalar micro-kernels. WAsm-specific micro-kernels explicitly use the f32.min/f32.max WAsm instructions. About 2% end-to-end improvement on x86; no change on ARM64. PiperOrigin-RevId: 283845483

commit: 436ebe6cc2a7a6cc746ac4bcb8cf95f665ae6c29 [log] [tgz]
author: Marat Dukhan <maratek@google.com> Wed Dec 04 15:10:12 2019 -0800
committer: XNNPACK Team <xnnpack-github-robot@google.com> Wed Dec 04 15:10:53 2019 -0800
tree: 367cdc7cbcbcaa139c4a55fc72e2c4b26b7bdfdb
parent: 05f3f6dc940ea45796c009bd09779f597a99151d [diff]
diff --git a/src/f32-avgpool/mp9p8q-wasm.c b/src/f32-avgpool/mp9p8q-wasm.c
new file mode 100644
index 0000000..415b8fb
--- /dev/null
+++ b/src/f32-avgpool/mp9p8q-wasm.c

@@ -0,0 +1,170 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/avgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_avgpool_ukernel_mp9p8q__wasm(  // Multi-pass f32 avgpool micro-kernel for ks > 9: the first pass sums 9 input pointers, every later pass adds up to 8 more.
+    size_t n,  // number of outer iterations; each one writes kc output floats (must be non-zero)
+    size_t ks,  // number of input pointers summed per outer iteration (must be > 9)
+    size_t kc,  // number of floats read through each input pointer per iteration (must be non-zero)
+    const float** input,  // array of input pointers; advanced as pointers are consumed
+    const float* zero,  // substituted for unused pointer slots in the last pass (presumably >= kc zero floats; confirm with caller)
+    float* buffer,  // scratch area holding kc partial sums between passes
+    float* output,  // destination for the scaled and clamped sums
+    size_t input_increment,  // byte offset added to input once the last-pass pointers are loaded
+    size_t output_increment,  // byte offset added to output after each outer iteration
+    const union xnn_f32_avgpool_params params[restrict static 1])  // supplies scalar.multiplier, scalar.output_min and scalar.output_max
+{
+  assert(n != 0);
+  assert(ks > 9);
+  assert(kc != 0);
+
+  const float vmultiplier = params->scalar.multiplier;
+  const float voutput_min = params->scalar.output_min;
+  const float voutput_max = params->scalar.output_max;
+
+  do {
+    {  // first pass: sum 9 pointers and store the partial sums in buffer
+      const float* i0 = *input++;
+      const float* i1 = *input++;
+      const float* i2 = *input++;
+      const float* i3 = *input++;
+      const float* i4 = *input++;
+      const float* i5 = *input++;
+      const float* i6 = *input++;
+      const float* i7 = *input++;
+      const float* i8 = *input++;
+
+      float* b = buffer;
+      size_t k = kc;
+      do {
+        const float vi0 = *i0++;
+        const float vi1 = *i1++;
+        const float vi2 = *i2++;
+        const float vi3 = *i3++;
+        const float vi4 = *i4++;
+        const float vi5 = *i5++;
+        const float vi6 = *i6++;
+        const float vi7 = *i7++;
+        const float vi8 = *i8++;
+
+        const float vsum01 = vi0 + vi1;  // pairwise (tree-shaped) summation rather than one serial chain
+        const float vsum23 = vi2 + vi3;
+        const float vsum45 = vi4 + vi5;
+        const float vsum67 = vi6 + vi7;
+        const float vsum018 = vsum01 + vi8;
+        const float vsum2345 = vsum23 + vsum45;
+        const float vsum01678 = vsum018 + vsum67;
+        const float vsum = vsum2345 + vsum01678;
+
+        *b++ = vsum;
+      } while (--k != 0);
+    }
+
+    size_t m = ks;
+    for (m -= 9; m > 8; m -= 8) {  // middle passes: run while more than 8 pointers remain; each adds 8 pointers to buffer
+      const float* i0 = *input++;
+      const float* i1 = *input++;
+      const float* i2 = *input++;
+      const float* i3 = *input++;
+      const float* i4 = *input++;
+      const float* i5 = *input++;
+      const float* i6 = *input++;
+      const float* i7 = *input++;
+
+      float* b = buffer;
+      size_t k = kc;
+      do {
+        const float vi0 = *i0++;
+        const float vi1 = *i1++;
+        const float vi2 = *i2++;
+        const float vi3 = *i3++;
+        const float vi4 = *i4++;
+        const float vi5 = *i5++;
+        const float vi6 = *i6++;
+        const float vi7 = *i7++;
+        const float vacc = *b;  // partial sum left by the previous pass
+
+        const float vsum01 = vi0 + vi1;
+        const float vsum23 = vi2 + vi3;
+        const float vsum45 = vi4 + vi5;
+        const float vsum67 = vi6 + vi7;
+        const float vsum01a = vsum01 + vacc;
+        const float vsum2345 = vsum23 + vsum45;
+        const float vsum0167a = vsum01a + vsum67;
+        const float vsum = vsum2345 + vsum0167a;
+
+        *b++ = vsum;  // overwrite the partial sum in place
+      } while (--k != 0);
+    }
+
+    {  // last pass: given ks > 9, m is now in [1, 8]; add the remaining pointers, scale, clamp and write output
+      const float* i0 = input[0];  // all 8 slots are loaded from the array even when m < 8
+      const float* i1 = input[1];
+      const float* i2 = input[2];
+      const float* i3 = input[3];
+      const float* i4 = input[4];
+      const float* i5 = input[5];
+      const float* i6 = input[6];
+      const float* i7 = input[7];
+      input = (const float**) ((uintptr_t) input + input_increment);  // byte-wise advance, applied on top of the ++ steps of the earlier passes
+      if (m < 2) {  // slots at index >= m are redirected to zero
+        i1 = zero;
+      }
+      if (m <= 2) {
+        i2 = zero;
+      }
+      if (m < 4) {
+        i3 = zero;
+      }
+      if (m <= 4) {
+        i4 = zero;
+      }
+      if (m < 6) {
+        i5 = zero;
+      }
+      if (m <= 6) {
+        i6 = zero;
+      }
+      if (m != 8) {  // equivalent to m < 8 because m cannot exceed 8 here
+        i7 = zero;
+      }
+
+      size_t k = kc;
+      float* b = buffer;
+      do {
+        const float vi0 = *i0++;
+        const float vi1 = *i1++;
+        const float vi2 = *i2++;
+        const float vi3 = *i3++;
+        const float vi4 = *i4++;
+        const float vi5 = *i5++;
+        const float vi6 = *i6++;
+        const float vi7 = *i7++;
+        const float vacc = *b++;  // buffer is only read in this pass
+
+        const float vsum01 = vi0 + vi1;
+        const float vsum23 = vi2 + vi3;
+        const float vsum45 = vi4 + vi5;
+        const float vsum67 = vi6 + vi7;
+        const float vsum01a = vsum01 + vacc;
+        const float vsum2345 = vsum23 + vsum45;
+        const float vsum0167a = vsum01a + vsum67;
+        const float vsum = vsum2345 + vsum0167a;
+
+        float vout = vsum * vmultiplier;
+        vout = __builtin_wasm_max_f32(vout, voutput_min);  // lower clamp via the WAsm max builtin
+        vout = __builtin_wasm_min_f32(vout, voutput_max);  // upper clamp via the WAsm min builtin
+
+        *output++ = vout;
+      } while (--k != 0);
+    }
+    output = (float*) ((uintptr_t) output + output_increment);  // byte-wise advance on top of the kc floats just written
+  } while (--n != 0);
+}

diff --git a/src/f32-avgpool/up9-wasm.c b/src/f32-avgpool/up9-wasm.c
new file mode 100644
index 0000000..2f9a444
--- /dev/null
+++ b/src/f32-avgpool/up9-wasm.c

@@ -0,0 +1,97 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/avgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_avgpool_ukernel_up9__wasm(  // Single-pass f32 avgpool micro-kernel for 1 <= ks <= 9 input pointers per output iteration.
+    size_t n,  // number of outer iterations; each one writes kc output floats (must be non-zero)
+    size_t ks,  // number of valid input pointers per iteration (must be in [1, 9])
+    size_t kc,  // number of floats read through each input pointer per iteration (must be non-zero)
+    const float** input,  // array of input pointers; advanced by input_increment bytes per iteration
+    const float* zero,  // substituted for pointer slots at index >= ks (presumably >= kc zero floats; confirm with caller)
+    float* output,  // destination for the scaled and clamped sums
+    size_t input_increment,  // byte offset added to input on every outer iteration
+    size_t output_increment,  // byte offset added to output after each outer iteration
+    const union xnn_f32_avgpool_params params[restrict static 1])  // supplies scalar.multiplier, scalar.output_min and scalar.output_max
+{
+  assert(n != 0);
+  assert(ks != 0);
+  assert(ks <= 9);
+  assert(kc != 0);
+
+  const float vmultiplier = params->scalar.multiplier;
+  const float voutput_min = params->scalar.output_min;
+  const float voutput_max = params->scalar.output_max;
+
+  do {
+    const float* i0 = input[0];  // all 9 slots are loaded from the array even when ks < 9
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_increment);  // increment is applied in bytes, not in elements
+    if (ks < 2) {  // slots at index >= ks are redirected to zero
+      i1 = zero;
+    }
+    if (ks <= 2) {
+      i2 = zero;
+    }
+    if (ks < 4) {
+      i3 = zero;
+    }
+    if (ks <= 4) {
+      i4 = zero;
+    }
+    if (ks < 6) {
+      i5 = zero;
+    }
+    if (ks <= 6) {
+      i6 = zero;
+    }
+    if (ks < 8) {
+      i7 = zero;
+    }
+    if (ks <= 8) {
+      i8 = zero;
+    }
+
+    size_t k = kc;
+    do {
+      const float vi0 = *i0++;
+      const float vi1 = *i1++;
+      const float vi2 = *i2++;
+      const float vi3 = *i3++;
+      const float vi4 = *i4++;
+      const float vi5 = *i5++;
+      const float vi6 = *i6++;
+      const float vi7 = *i7++;
+      const float vi8 = *i8++;
+
+      const float vsum01 = vi0 + vi1;  // pairwise (tree-shaped) summation rather than one serial chain
+      const float vsum23 = vi2 + vi3;
+      const float vsum45 = vi4 + vi5;
+      const float vsum67 = vi6 + vi7;
+      const float vsum018 = vsum01 + vi8;
+      const float vsum2345 = vsum23 + vsum45;
+      const float vsum01678 = vsum018 + vsum67;
+      const float vsum = vsum2345 + vsum01678;
+
+      float vout = vsum * vmultiplier;
+      vout = __builtin_wasm_max_f32(vout, voutput_min);  // lower clamp via the WAsm max builtin
+      vout = __builtin_wasm_min_f32(vout, voutput_max);  // upper clamp via the WAsm min builtin
+
+      *output++ = vout;
+    } while (--k != 0);
+    output = (float*) ((uintptr_t) output + output_increment);  // byte-wise advance on top of the kc floats just written
+  } while (--n != 0);
+}

diff --git a/src/f32-clamp/wasm.c b/src/f32-clamp/wasm.c
new file mode 100644
index 0000000..5dbf6c1
--- /dev/null
+++ b/src/f32-clamp/wasm.c

@@ -0,0 +1,44 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/clamp.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_clamp_ukernel__wasm(  // Clamps each float of x into [params->scalar.min, params->scalar.max] and stores the result in y.
+    size_t n,  // size of the input in BYTES; must be a non-zero multiple of sizeof(float)
+    const float* x,  // input values
+    float* y,  // output values; receives n / sizeof(float) floats
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min and scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_max = params->scalar.max;
+  const float vy_min = params->scalar.min;
+
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {  // main loop handles two floats per iteration
+    const float vx0 = x[0];
+    const float vx1 = x[1];
+    x += 2;
+
+    float vy0 = __builtin_wasm_max_f32(vx0, vy_min);  // lower bound first, then upper bound
+    float vy1 = __builtin_wasm_max_f32(vx1, vy_min);
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if (n != 0) {  // at most one float is left when the element count is odd
+    const float vx = *x;
+    float vy = __builtin_wasm_max_f32(vx, vy_min);
+    vy = __builtin_wasm_min_f32(vy, vy_max);
+    *y = vy;
+  }
+}

diff --git a/src/f32-dwconv/gen/up1x25-wasm-acc2.c b/src/f32-dwconv/gen/up1x25-wasm-acc2.c
new file mode 100644
index 0000000..599629b
--- /dev/null
+++ b/src/f32-dwconv/gen/up1x25-wasm-acc2.c

@@ -0,0 +1,176 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up1x25__wasm_acc2(  // f32 dwconv micro-kernel: 25 taps, one channel per inner iteration, two partial accumulators.
+    size_t channels,  // number of channels processed per output pixel (must be non-zero)
+    size_t output_width,  // number of outer iterations, i.e. output pixels (must be non-zero)
+    const float** input,  // array of input pointers; 25 are read per output pixel
+    const float* weights,  // packed per channel as 26 floats: initial accumulator value, then 25 tap weights
+    float* output,  // destination; one float is written per channel per output pixel
+    size_t input_stride,  // byte offset added to input after each output pixel
+    size_t output_increment,  // byte offset added to output after each output pixel
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min and scalar.max
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    const float* i9 = input[9];
+    const float* i10 = input[10];
+    const float* i11 = input[11];
+    const float* i12 = input[12];
+    const float* i13 = input[13];
+    const float* i14 = input[14];
+    const float* i15 = input[15];
+    const float* i16 = input[16];
+    const float* i17 = input[17];
+    const float* i18 = input[18];
+    const float* i19 = input[19];
+    const float* i20 = input[20];
+    const float* i21 = input[21];
+    const float* i22 = input[22];
+    const float* i23 = input[23];
+    const float* i24 = input[24];
+    input = (const float**) ((uintptr_t) input + input_stride);  // stride is applied in bytes, not in elements
+
+    size_t c = channels;
+    const float* w = weights;  // weights are re-read from the start for every output pixel
+    do {
+      float vacc0p0 = w[0];  // accumulator 0 starts from the first packed value (presumably the bias; confirm against the packing code)
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+
+      const float vi1 = *i1++;
+      const float vk1 = w[2];
+      float vacc0p1 = vi1 * vk1;  // accumulator 1 starts from tap 1 and collects the odd-numbered taps
+
+      const float vi2 = *i2++;
+      const float vk2 = w[3];
+      vacc0p0 += vi2 * vk2;
+
+      const float vi3 = *i3++;
+      const float vk3 = w[4];
+      vacc0p1 += vi3 * vk3;
+
+      const float vi4 = *i4++;
+      const float vk4 = w[5];
+      vacc0p0 += vi4 * vk4;
+
+      const float vi5 = *i5++;
+      const float vk5 = w[6];
+      vacc0p1 += vi5 * vk5;
+
+      const float vi6 = *i6++;
+      const float vk6 = w[7];
+      vacc0p0 += vi6 * vk6;
+
+      const float vi7 = *i7++;
+      const float vk7 = w[8];
+      vacc0p1 += vi7 * vk7;
+
+      const float vi8 = *i8++;
+      const float vk8 = w[9];
+      vacc0p0 += vi8 * vk8;
+
+      const float vi9 = *i9++;
+      const float vk9 = w[10];
+      vacc0p1 += vi9 * vk9;
+
+      const float vi10 = *i10++;
+      const float vk10 = w[11];
+      vacc0p0 += vi10 * vk10;
+
+      const float vi11 = *i11++;
+      const float vk11 = w[12];
+      vacc0p1 += vi11 * vk11;
+
+      const float vi12 = *i12++;
+      const float vk12 = w[13];
+      vacc0p0 += vi12 * vk12;
+
+      const float vi13 = *i13++;
+      const float vk13 = w[14];
+      vacc0p1 += vi13 * vk13;
+
+      const float vi14 = *i14++;
+      const float vk14 = w[15];
+      vacc0p0 += vi14 * vk14;
+
+      const float vi15 = *i15++;
+      const float vk15 = w[16];
+      vacc0p1 += vi15 * vk15;
+
+      const float vi16 = *i16++;
+      const float vk16 = w[17];
+      vacc0p0 += vi16 * vk16;
+
+      const float vi17 = *i17++;
+      const float vk17 = w[18];
+      vacc0p1 += vi17 * vk17;
+
+      const float vi18 = *i18++;
+      const float vk18 = w[19];
+      vacc0p0 += vi18 * vk18;
+
+      const float vi19 = *i19++;
+      const float vk19 = w[20];
+      vacc0p1 += vi19 * vk19;
+
+      const float vi20 = *i20++;
+      const float vk20 = w[21];
+      vacc0p0 += vi20 * vk20;
+
+      const float vi21 = *i21++;
+      const float vk21 = w[22];
+      vacc0p1 += vi21 * vk21;
+
+      const float vi22 = *i22++;
+      const float vk22 = w[23];
+      vacc0p0 += vi22 * vk22;
+
+      const float vi23 = *i23++;
+      const float vk23 = w[24];
+      vacc0p1 += vi23 * vk23;
+
+      const float vi24 = *i24++;
+      const float vk24 = w[25];
+      vacc0p0 += vi24 * vk24;
+
+      w += 26;  // 1 initial value + 25 tap weights per channel
+
+      vacc0p0 += vacc0p1;  // merge the two partial accumulators
+
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);  // lower clamp via the WAsm max builtin
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);  // upper clamp via the WAsm min builtin
+
+      *output++ = vacc0;
+    } while (--c != 0);
+
+    output = (float*) ((uintptr_t) output + output_increment);  // byte-wise advance on top of the floats just written
+  } while (--output_width != 0);
+}

diff --git a/src/f32-dwconv/gen/up1x25-wasm.c b/src/f32-dwconv/gen/up1x25-wasm.c
new file mode 100644
index 0000000..d4b8185
--- /dev/null
+++ b/src/f32-dwconv/gen/up1x25-wasm.c

@@ -0,0 +1,175 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up1x25__wasm(  // f32 dwconv micro-kernel: 25 taps, one channel per inner iteration, single accumulator.
+    size_t channels,  // number of channels processed per output pixel (must be non-zero)
+    size_t output_width,  // number of outer iterations, i.e. output pixels (must be non-zero)
+    const float** input,  // array of input pointers; 25 are read per output pixel
+    const float* weights,  // packed per channel as 26 floats: initial accumulator value, then 25 tap weights
+    float* output,  // destination; one float is written per channel per output pixel
+    size_t input_stride,  // byte offset added to input after each output pixel
+    size_t output_increment,  // byte offset added to output after each output pixel
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min and scalar.max
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    const float* i9 = input[9];
+    const float* i10 = input[10];
+    const float* i11 = input[11];
+    const float* i12 = input[12];
+    const float* i13 = input[13];
+    const float* i14 = input[14];
+    const float* i15 = input[15];
+    const float* i16 = input[16];
+    const float* i17 = input[17];
+    const float* i18 = input[18];
+    const float* i19 = input[19];
+    const float* i20 = input[20];
+    const float* i21 = input[21];
+    const float* i22 = input[22];
+    const float* i23 = input[23];
+    const float* i24 = input[24];
+    input = (const float**) ((uintptr_t) input + input_stride);  // stride is applied in bytes, not in elements
+
+    size_t c = channels;
+    const float* w = weights;  // weights are re-read from the start for every output pixel
+    do {
+      float vacc0p0 = w[0];  // accumulator starts from the first packed value (presumably the bias; confirm against the packing code)
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+
+      const float vi1 = *i1++;
+      const float vk1 = w[2];
+      vacc0p0 += vi1 * vk1;
+
+      const float vi2 = *i2++;
+      const float vk2 = w[3];
+      vacc0p0 += vi2 * vk2;
+
+      const float vi3 = *i3++;
+      const float vk3 = w[4];
+      vacc0p0 += vi3 * vk3;
+
+      const float vi4 = *i4++;
+      const float vk4 = w[5];
+      vacc0p0 += vi4 * vk4;
+
+      const float vi5 = *i5++;
+      const float vk5 = w[6];
+      vacc0p0 += vi5 * vk5;
+
+      const float vi6 = *i6++;
+      const float vk6 = w[7];
+      vacc0p0 += vi6 * vk6;
+
+      const float vi7 = *i7++;
+      const float vk7 = w[8];
+      vacc0p0 += vi7 * vk7;
+
+      const float vi8 = *i8++;
+      const float vk8 = w[9];
+      vacc0p0 += vi8 * vk8;
+
+      const float vi9 = *i9++;
+      const float vk9 = w[10];
+      vacc0p0 += vi9 * vk9;
+
+      const float vi10 = *i10++;
+      const float vk10 = w[11];
+      vacc0p0 += vi10 * vk10;
+
+      const float vi11 = *i11++;
+      const float vk11 = w[12];
+      vacc0p0 += vi11 * vk11;
+
+      const float vi12 = *i12++;
+      const float vk12 = w[13];
+      vacc0p0 += vi12 * vk12;
+
+      const float vi13 = *i13++;
+      const float vk13 = w[14];
+      vacc0p0 += vi13 * vk13;
+
+      const float vi14 = *i14++;
+      const float vk14 = w[15];
+      vacc0p0 += vi14 * vk14;
+
+      const float vi15 = *i15++;
+      const float vk15 = w[16];
+      vacc0p0 += vi15 * vk15;
+
+      const float vi16 = *i16++;
+      const float vk16 = w[17];
+      vacc0p0 += vi16 * vk16;
+
+      const float vi17 = *i17++;
+      const float vk17 = w[18];
+      vacc0p0 += vi17 * vk17;
+
+      const float vi18 = *i18++;
+      const float vk18 = w[19];
+      vacc0p0 += vi18 * vk18;
+
+      const float vi19 = *i19++;
+      const float vk19 = w[20];
+      vacc0p0 += vi19 * vk19;
+
+      const float vi20 = *i20++;
+      const float vk20 = w[21];
+      vacc0p0 += vi20 * vk20;
+
+      const float vi21 = *i21++;
+      const float vk21 = w[22];
+      vacc0p0 += vi21 * vk21;
+
+      const float vi22 = *i22++;
+      const float vk22 = w[23];
+      vacc0p0 += vi22 * vk22;
+
+      const float vi23 = *i23++;
+      const float vk23 = w[24];
+      vacc0p0 += vi23 * vk23;
+
+      const float vi24 = *i24++;
+      const float vk24 = w[25];
+      vacc0p0 += vi24 * vk24;
+
+      w += 26;  // 1 initial value + 25 tap weights per channel
+
+
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);  // lower clamp via the WAsm max builtin
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);  // upper clamp via the WAsm min builtin
+
+      *output++ = vacc0;
+    } while (--c != 0);
+
+    output = (float*) ((uintptr_t) output + output_increment);  // byte-wise advance on top of the floats just written
+  } while (--output_width != 0);
+}

diff --git a/src/f32-dwconv/gen/up1x4-wasm-acc2.c b/src/f32-dwconv/gen/up1x4-wasm-acc2.c
new file mode 100644
index 0000000..a54b065
--- /dev/null
+++ b/src/f32-dwconv/gen/up1x4-wasm-acc2.c

@@ -0,0 +1,71 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up1x4__wasm_acc2(  // f32 dwconv micro-kernel: 4 taps, one channel per inner iteration, two partial accumulators.
+    size_t channels,  // number of channels processed per output pixel (must be non-zero)
+    size_t output_width,  // number of outer iterations, i.e. output pixels (must be non-zero)
+    const float** input,  // array of input pointers; 4 are read per output pixel
+    const float* weights,  // packed per channel as 5 floats: initial accumulator value, then 4 tap weights
+    float* output,  // destination; one float is written per channel per output pixel
+    size_t input_stride,  // byte offset added to input after each output pixel
+    size_t output_increment,  // byte offset added to output after each output pixel
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min and scalar.max
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    input = (const float**) ((uintptr_t) input + input_stride);  // stride is applied in bytes, not in elements
+
+    size_t c = channels;
+    const float* w = weights;  // weights are re-read from the start for every output pixel
+    do {
+      float vacc0p0 = w[0];  // accumulator 0 starts from the first packed value (presumably the bias; confirm against the packing code)
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+
+      const float vi1 = *i1++;
+      const float vk1 = w[2];
+      float vacc0p1 = vi1 * vk1;  // accumulator 1 starts from tap 1 and collects the odd-numbered taps
+
+      const float vi2 = *i2++;
+      const float vk2 = w[3];
+      vacc0p0 += vi2 * vk2;
+
+      const float vi3 = *i3++;
+      const float vk3 = w[4];
+      vacc0p1 += vi3 * vk3;
+
+      w += 5;  // 1 initial value + 4 tap weights per channel
+
+      vacc0p0 += vacc0p1;  // merge the two partial accumulators
+
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);  // lower clamp via the WAsm max builtin
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);  // upper clamp via the WAsm min builtin
+
+      *output++ = vacc0;
+    } while (--c != 0);
+
+    output = (float*) ((uintptr_t) output + output_increment);  // byte-wise advance on top of the floats just written
+  } while (--output_width != 0);
+}

diff --git a/src/f32-dwconv/gen/up1x4-wasm.c b/src/f32-dwconv/gen/up1x4-wasm.c
new file mode 100644
index 0000000..b0f1e1d
--- /dev/null
+++ b/src/f32-dwconv/gen/up1x4-wasm.c

@@ -0,0 +1,70 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up1x4__wasm(  // f32 dwconv micro-kernel: 4 taps, one channel per inner iteration, single accumulator.
+    size_t channels,  // number of channels processed per output pixel (must be non-zero)
+    size_t output_width,  // number of outer iterations, i.e. output pixels (must be non-zero)
+    const float** input,  // array of input pointers; 4 are read per output pixel
+    const float* weights,  // packed per channel as 5 floats: initial accumulator value, then 4 tap weights
+    float* output,  // destination; one float is written per channel per output pixel
+    size_t input_stride,  // byte offset added to input after each output pixel
+    size_t output_increment,  // byte offset added to output after each output pixel
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min and scalar.max
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    input = (const float**) ((uintptr_t) input + input_stride);  // stride is applied in bytes, not in elements
+
+    size_t c = channels;
+    const float* w = weights;  // weights are re-read from the start for every output pixel
+    do {
+      float vacc0p0 = w[0];  // accumulator starts from the first packed value (presumably the bias; confirm against the packing code)
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+
+      const float vi1 = *i1++;
+      const float vk1 = w[2];
+      vacc0p0 += vi1 * vk1;
+
+      const float vi2 = *i2++;
+      const float vk2 = w[3];
+      vacc0p0 += vi2 * vk2;
+
+      const float vi3 = *i3++;
+      const float vk3 = w[4];
+      vacc0p0 += vi3 * vk3;
+
+      w += 5;  // 1 initial value + 4 tap weights per channel
+
+
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);  // lower clamp via the WAsm max builtin
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);  // upper clamp via the WAsm min builtin
+
+      *output++ = vacc0;
+    } while (--c != 0);
+
+    output = (float*) ((uintptr_t) output + output_increment);  // byte-wise advance on top of the floats just written
+  } while (--output_width != 0);
+}

diff --git a/src/f32-dwconv/gen/up1x9-wasm-acc2.c b/src/f32-dwconv/gen/up1x9-wasm-acc2.c
new file mode 100644
index 0000000..758a09b
--- /dev/null
+++ b/src/f32-dwconv/gen/up1x9-wasm-acc2.c

@@ -0,0 +1,96 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up1x9__wasm_acc2(  // f32 dwconv micro-kernel: 9 taps, one channel per inner iteration, two partial accumulators.
+    size_t channels,  // number of channels processed per output pixel (must be non-zero)
+    size_t output_width,  // number of outer iterations, i.e. output pixels (must be non-zero)
+    const float** input,  // array of input pointers; 9 are read per output pixel
+    const float* weights,  // packed per channel as 10 floats: initial accumulator value, then 9 tap weights
+    float* output,  // destination; one float is written per channel per output pixel
+    size_t input_stride,  // byte offset added to input after each output pixel
+    size_t output_increment,  // byte offset added to output after each output pixel
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min and scalar.max
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);  // stride is applied in bytes, not in elements
+
+    size_t c = channels;
+    const float* w = weights;  // weights are re-read from the start for every output pixel
+    do {
+      float vacc0p0 = w[0];  // accumulator 0 starts from the first packed value (presumably the bias; confirm against the packing code)
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+
+      const float vi1 = *i1++;
+      const float vk1 = w[2];
+      float vacc0p1 = vi1 * vk1;  // accumulator 1 starts from tap 1 and collects the odd-numbered taps
+
+      const float vi2 = *i2++;
+      const float vk2 = w[3];
+      vacc0p0 += vi2 * vk2;
+
+      const float vi3 = *i3++;
+      const float vk3 = w[4];
+      vacc0p1 += vi3 * vk3;
+
+      const float vi4 = *i4++;
+      const float vk4 = w[5];
+      vacc0p0 += vi4 * vk4;
+
+      const float vi5 = *i5++;
+      const float vk5 = w[6];
+      vacc0p1 += vi5 * vk5;
+
+      const float vi6 = *i6++;
+      const float vk6 = w[7];
+      vacc0p0 += vi6 * vk6;
+
+      const float vi7 = *i7++;
+      const float vk7 = w[8];
+      vacc0p1 += vi7 * vk7;
+
+      const float vi8 = *i8++;
+      const float vk8 = w[9];
+      vacc0p0 += vi8 * vk8;
+
+      w += 10;  // 1 initial value + 9 tap weights per channel
+
+      vacc0p0 += vacc0p1;  // merge the two partial accumulators
+
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);  // lower clamp via the WAsm max builtin
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);  // upper clamp via the WAsm min builtin
+
+      *output++ = vacc0;
+    } while (--c != 0);
+
+    output = (float*) ((uintptr_t) output + output_increment);  // byte-wise advance on top of the floats just written
+  } while (--output_width != 0);
+}

diff --git a/src/f32-dwconv/gen/up1x9-wasm.c b/src/f32-dwconv/gen/up1x9-wasm.c
new file mode 100644
index 0000000..8292a75
--- /dev/null
+++ b/src/f32-dwconv/gen/up1x9-wasm.c

@@ -0,0 +1,95 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up1x9__wasm(  // f32 dwconv micro-kernel: 9 taps, one channel per inner iteration, single accumulator.
+    size_t channels,  // number of channels processed per output pixel (must be non-zero)
+    size_t output_width,  // number of outer iterations, i.e. output pixels (must be non-zero)
+    const float** input,  // array of input pointers; 9 are read per output pixel
+    const float* weights,  // packed per channel as 10 floats: initial accumulator value, then 9 tap weights
+    float* output,  // destination; one float is written per channel per output pixel
+    size_t input_stride,  // byte offset added to input after each output pixel
+    size_t output_increment,  // byte offset added to output after each output pixel
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min and scalar.max
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);  // stride is applied in bytes, not in elements
+
+    size_t c = channels;
+    const float* w = weights;  // weights are re-read from the start for every output pixel
+    do {
+      float vacc0p0 = w[0];  // accumulator starts from the first packed value (presumably the bias; confirm against the packing code)
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+
+      const float vi1 = *i1++;
+      const float vk1 = w[2];
+      vacc0p0 += vi1 * vk1;
+
+      const float vi2 = *i2++;
+      const float vk2 = w[3];
+      vacc0p0 += vi2 * vk2;
+
+      const float vi3 = *i3++;
+      const float vk3 = w[4];
+      vacc0p0 += vi3 * vk3;
+
+      const float vi4 = *i4++;
+      const float vk4 = w[5];
+      vacc0p0 += vi4 * vk4;
+
+      const float vi5 = *i5++;
+      const float vk5 = w[6];
+      vacc0p0 += vi5 * vk5;
+
+      const float vi6 = *i6++;
+      const float vk6 = w[7];
+      vacc0p0 += vi6 * vk6;
+
+      const float vi7 = *i7++;
+      const float vk7 = w[8];
+      vacc0p0 += vi7 * vk7;
+
+      const float vi8 = *i8++;
+      const float vk8 = w[9];
+      vacc0p0 += vi8 * vk8;
+
+      w += 10;  // 1 initial value + 9 tap weights per channel
+
+
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);  // lower clamp via the WAsm max builtin
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);  // upper clamp via the WAsm min builtin
+
+      *output++ = vacc0;
+    } while (--c != 0);
+
+    output = (float*) ((uintptr_t) output + output_increment);  // byte-wise advance on top of the floats just written
+  } while (--output_width != 0);
+}

diff --git a/src/f32-dwconv/gen/up2x25-wasm-acc2.c b/src/f32-dwconv/gen/up2x25-wasm-acc2.c
new file mode 100644
index 0000000..e411a68
--- /dev/null
+++ b/src/f32-dwconv/gen/up2x25-wasm-acc2.c

@@ -0,0 +1,396 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up2x25__wasm_acc2(  // per channel: weight-seeded sum of 25 input*weight products, clamped to [min, max]
+    size_t channels,  // floats consumed from each input pointer per outer iteration; must be non-zero
+    size_t output_width,  // number of outer-loop iterations; must be non-zero
+    const float** input,  // 25 pointers are read per outer iteration, then the array is advanced by input_stride
+    const float* weights,  // re-read from the start on every outer iteration; layout described in the main loop
+    float* output,  // receives `channels` floats per outer iteration
+    size_t input_stride,  // byte offset added to `input` after each outer iteration
+    size_t output_increment,  // byte offset added to `output` after each outer iteration's channels are written
+    const union xnn_f32_output_params params[restrict static 1])  // supplies the scalar.min / scalar.max clamp bounds
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+  // Clamp bounds do not change inside the loops, so they are loaded once.
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    const float* i9 = input[9];
+    const float* i10 = input[10];
+    const float* i11 = input[11];
+    const float* i12 = input[12];
+    const float* i13 = input[13];
+    const float* i14 = input[14];
+    const float* i15 = input[15];
+    const float* i16 = input[16];
+    const float* i17 = input[17];
+    const float* i18 = input[18];
+    const float* i19 = input[19];
+    const float* i20 = input[20];
+    const float* i21 = input[21];
+    const float* i22 = input[22];
+    const float* i23 = input[23];
+    const float* i24 = input[24];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 2; c -= 2) {  // main loop: two channels per iteration
+      float vacc0p0 = w[0];
+      float vacc1p0 = w[1];
+      // Weight group layout (52 floats): w[0..1] seed the two accumulators (presumably biases),
+      // then w[2+2k] / w[3+2k] are tap k's weights for the first / second channel of the pair.
+      const float vi0x0 = i0[0];
+      const float vi0x1 = i0[1];
+      i0 += 2;
+
+      const float vk0x0 = w[2];
+      vacc0p0 += vi0x0 * vk0x0;
+      const float vk0x1 = w[3];
+      vacc1p0 += vi0x1 * vk0x1;
+
+      const float vi1x0 = i1[0];
+      const float vi1x1 = i1[1];
+      i1 += 2;
+
+      const float vk1x0 = w[4];
+      float vacc0p1 = vi1x0 * vk1x0;  // second partial sum starts at tap 1; odd taps accumulate here, even taps in p0
+      const float vk1x1 = w[5];
+      float vacc1p1 = vi1x1 * vk1x1;
+
+      const float vi2x0 = i2[0];
+      const float vi2x1 = i2[1];
+      i2 += 2;
+
+      const float vk2x0 = w[6];
+      vacc0p0 += vi2x0 * vk2x0;
+      const float vk2x1 = w[7];
+      vacc1p0 += vi2x1 * vk2x1;
+
+      const float vi3x0 = i3[0];
+      const float vi3x1 = i3[1];
+      i3 += 2;
+
+      const float vk3x0 = w[8];
+      vacc0p1 += vi3x0 * vk3x0;
+      const float vk3x1 = w[9];
+      vacc1p1 += vi3x1 * vk3x1;
+
+      const float vi4x0 = i4[0];
+      const float vi4x1 = i4[1];
+      i4 += 2;
+
+      const float vk4x0 = w[10];
+      vacc0p0 += vi4x0 * vk4x0;
+      const float vk4x1 = w[11];
+      vacc1p0 += vi4x1 * vk4x1;
+
+      const float vi5x0 = i5[0];
+      const float vi5x1 = i5[1];
+      i5 += 2;
+
+      const float vk5x0 = w[12];
+      vacc0p1 += vi5x0 * vk5x0;
+      const float vk5x1 = w[13];
+      vacc1p1 += vi5x1 * vk5x1;
+
+      const float vi6x0 = i6[0];
+      const float vi6x1 = i6[1];
+      i6 += 2;
+
+      const float vk6x0 = w[14];
+      vacc0p0 += vi6x0 * vk6x0;
+      const float vk6x1 = w[15];
+      vacc1p0 += vi6x1 * vk6x1;
+
+      const float vi7x0 = i7[0];
+      const float vi7x1 = i7[1];
+      i7 += 2;
+
+      const float vk7x0 = w[16];
+      vacc0p1 += vi7x0 * vk7x0;
+      const float vk7x1 = w[17];
+      vacc1p1 += vi7x1 * vk7x1;
+
+      const float vi8x0 = i8[0];
+      const float vi8x1 = i8[1];
+      i8 += 2;
+
+      const float vk8x0 = w[18];
+      vacc0p0 += vi8x0 * vk8x0;
+      const float vk8x1 = w[19];
+      vacc1p0 += vi8x1 * vk8x1;
+
+      const float vi9x0 = i9[0];
+      const float vi9x1 = i9[1];
+      i9 += 2;
+
+      const float vk9x0 = w[20];
+      vacc0p1 += vi9x0 * vk9x0;
+      const float vk9x1 = w[21];
+      vacc1p1 += vi9x1 * vk9x1;
+
+      const float vi10x0 = i10[0];
+      const float vi10x1 = i10[1];
+      i10 += 2;
+
+      const float vk10x0 = w[22];
+      vacc0p0 += vi10x0 * vk10x0;
+      const float vk10x1 = w[23];
+      vacc1p0 += vi10x1 * vk10x1;
+
+      const float vi11x0 = i11[0];
+      const float vi11x1 = i11[1];
+      i11 += 2;
+
+      const float vk11x0 = w[24];
+      vacc0p1 += vi11x0 * vk11x0;
+      const float vk11x1 = w[25];
+      vacc1p1 += vi11x1 * vk11x1;
+
+      const float vi12x0 = i12[0];
+      const float vi12x1 = i12[1];
+      i12 += 2;
+
+      const float vk12x0 = w[26];
+      vacc0p0 += vi12x0 * vk12x0;
+      const float vk12x1 = w[27];
+      vacc1p0 += vi12x1 * vk12x1;
+
+      const float vi13x0 = i13[0];
+      const float vi13x1 = i13[1];
+      i13 += 2;
+
+      const float vk13x0 = w[28];
+      vacc0p1 += vi13x0 * vk13x0;
+      const float vk13x1 = w[29];
+      vacc1p1 += vi13x1 * vk13x1;
+
+      const float vi14x0 = i14[0];
+      const float vi14x1 = i14[1];
+      i14 += 2;
+
+      const float vk14x0 = w[30];
+      vacc0p0 += vi14x0 * vk14x0;
+      const float vk14x1 = w[31];
+      vacc1p0 += vi14x1 * vk14x1;
+
+      const float vi15x0 = i15[0];
+      const float vi15x1 = i15[1];
+      i15 += 2;
+
+      const float vk15x0 = w[32];
+      vacc0p1 += vi15x0 * vk15x0;
+      const float vk15x1 = w[33];
+      vacc1p1 += vi15x1 * vk15x1;
+
+      const float vi16x0 = i16[0];
+      const float vi16x1 = i16[1];
+      i16 += 2;
+
+      const float vk16x0 = w[34];
+      vacc0p0 += vi16x0 * vk16x0;
+      const float vk16x1 = w[35];
+      vacc1p0 += vi16x1 * vk16x1;
+
+      const float vi17x0 = i17[0];
+      const float vi17x1 = i17[1];
+      i17 += 2;
+
+      const float vk17x0 = w[36];
+      vacc0p1 += vi17x0 * vk17x0;
+      const float vk17x1 = w[37];
+      vacc1p1 += vi17x1 * vk17x1;
+
+      const float vi18x0 = i18[0];
+      const float vi18x1 = i18[1];
+      i18 += 2;
+
+      const float vk18x0 = w[38];
+      vacc0p0 += vi18x0 * vk18x0;
+      const float vk18x1 = w[39];
+      vacc1p0 += vi18x1 * vk18x1;
+
+      const float vi19x0 = i19[0];
+      const float vi19x1 = i19[1];
+      i19 += 2;
+
+      const float vk19x0 = w[40];
+      vacc0p1 += vi19x0 * vk19x0;
+      const float vk19x1 = w[41];
+      vacc1p1 += vi19x1 * vk19x1;
+
+      const float vi20x0 = i20[0];
+      const float vi20x1 = i20[1];
+      i20 += 2;
+
+      const float vk20x0 = w[42];
+      vacc0p0 += vi20x0 * vk20x0;
+      const float vk20x1 = w[43];
+      vacc1p0 += vi20x1 * vk20x1;
+
+      const float vi21x0 = i21[0];
+      const float vi21x1 = i21[1];
+      i21 += 2;
+
+      const float vk21x0 = w[44];
+      vacc0p1 += vi21x0 * vk21x0;
+      const float vk21x1 = w[45];
+      vacc1p1 += vi21x1 * vk21x1;
+
+      const float vi22x0 = i22[0];
+      const float vi22x1 = i22[1];
+      i22 += 2;
+
+      const float vk22x0 = w[46];
+      vacc0p0 += vi22x0 * vk22x0;
+      const float vk22x1 = w[47];
+      vacc1p0 += vi22x1 * vk22x1;
+
+      const float vi23x0 = i23[0];
+      const float vi23x1 = i23[1];
+      i23 += 2;
+
+      const float vk23x0 = w[48];
+      vacc0p1 += vi23x0 * vk23x0;
+      const float vk23x1 = w[49];
+      vacc1p1 += vi23x1 * vk23x1;
+
+      const float vi24x0 = i24[0];
+      const float vi24x1 = i24[1];
+      i24 += 2;
+
+      const float vk24x0 = w[50];
+      vacc0p0 += vi24x0 * vk24x0;
+      const float vk24x1 = w[51];
+      vacc1p0 += vi24x1 * vk24x1;
+
+      w += 52;
+
+      // Fold each channel's two partial sums into vacc0p0 / vacc1p0
+      vacc0p0 = vacc0p0 + vacc0p1;
+      vacc1p0 = vacc1p0 + vacc1p1;
+      // Clamp to [vmin, vmax]: max against the lower bound first, then min against the upper bound.
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);
+      float vacc1 = __builtin_wasm_max_f32(vacc1p0, vmin);
+
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+      vacc1 = __builtin_wasm_min_f32(vacc1, vmax);
+
+      output[0] = vacc0;
+      output[1] = vacc1;
+      output += 2;
+    }
+    for (; c >= 1; c -= 1) {  // remainder: executes at most once, when `channels` is odd
+      float vacc0p0 = *w++;  // w now sits one past the group start, so w[2k+1] below is tap k's first-channel weight
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+      const float vi1 = *i1++;
+      const float vk1 = w[3];
+      float vacc0p1 = vi1 * vk1;
+      const float vi2 = *i2++;
+      const float vk2 = w[5];
+      vacc0p0 += vi2 * vk2;
+      const float vi3 = *i3++;
+      const float vk3 = w[7];
+      vacc0p1 += vi3 * vk3;
+      const float vi4 = *i4++;
+      const float vk4 = w[9];
+      vacc0p0 += vi4 * vk4;
+      const float vi5 = *i5++;
+      const float vk5 = w[11];
+      vacc0p1 += vi5 * vk5;
+      const float vi6 = *i6++;
+      const float vk6 = w[13];
+      vacc0p0 += vi6 * vk6;
+      const float vi7 = *i7++;
+      const float vk7 = w[15];
+      vacc0p1 += vi7 * vk7;
+      const float vi8 = *i8++;
+      const float vk8 = w[17];
+      vacc0p0 += vi8 * vk8;
+      const float vi9 = *i9++;
+      const float vk9 = w[19];
+      vacc0p1 += vi9 * vk9;
+      const float vi10 = *i10++;
+      const float vk10 = w[21];
+      vacc0p0 += vi10 * vk10;
+      const float vi11 = *i11++;
+      const float vk11 = w[23];
+      vacc0p1 += vi11 * vk11;
+      const float vi12 = *i12++;
+      const float vk12 = w[25];
+      vacc0p0 += vi12 * vk12;
+      const float vi13 = *i13++;
+      const float vk13 = w[27];
+      vacc0p1 += vi13 * vk13;
+      const float vi14 = *i14++;
+      const float vk14 = w[29];
+      vacc0p0 += vi14 * vk14;
+      const float vi15 = *i15++;
+      const float vk15 = w[31];
+      vacc0p1 += vi15 * vk15;
+      const float vi16 = *i16++;
+      const float vk16 = w[33];
+      vacc0p0 += vi16 * vk16;
+      const float vi17 = *i17++;
+      const float vk17 = w[35];
+      vacc0p1 += vi17 * vk17;
+      const float vi18 = *i18++;
+      const float vk18 = w[37];
+      vacc0p0 += vi18 * vk18;
+      const float vi19 = *i19++;
+      const float vk19 = w[39];
+      vacc0p1 += vi19 * vk19;
+      const float vi20 = *i20++;
+      const float vk20 = w[41];
+      vacc0p0 += vi20 * vk20;
+      const float vi21 = *i21++;
+      const float vk21 = w[43];
+      vacc0p1 += vi21 * vk21;
+      const float vi22 = *i22++;
+      const float vk22 = w[45];
+      vacc0p0 += vi22 * vk22;
+      const float vi23 = *i23++;
+      const float vk23 = w[47];
+      vacc0p1 += vi23 * vk23;
+      const float vi24 = *i24++;
+      const float vk24 = w[49];
+      vacc0p0 += vi24 * vk24;
+
+      // Fold the second partial sum into vacc0p0
+      vacc0p0 = vacc0p0 + vacc0p1;
+
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+      *output++ = vacc0;
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);  // increment is applied in bytes, on top of the floats just written
+  } while (--output_width != 0);
+}

diff --git a/src/f32-dwconv/gen/up2x25-wasm.c b/src/f32-dwconv/gen/up2x25-wasm.c
new file mode 100644
index 0000000..5f82dce
--- /dev/null
+++ b/src/f32-dwconv/gen/up2x25-wasm.c

@@ -0,0 +1,391 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up2x25__wasm(  // per channel: weight-seeded sum of 25 input*weight products, clamped to [min, max]
+    size_t channels,  // floats consumed from each input pointer per outer iteration; must be non-zero
+    size_t output_width,  // number of outer-loop iterations; must be non-zero
+    const float** input,  // 25 pointers are read per outer iteration, then the array is advanced by input_stride
+    const float* weights,  // re-read from the start on every outer iteration; layout described in the main loop
+    float* output,  // receives `channels` floats per outer iteration
+    size_t input_stride,  // byte offset added to `input` after each outer iteration
+    size_t output_increment,  // byte offset added to `output` after each outer iteration's channels are written
+    const union xnn_f32_output_params params[restrict static 1])  // supplies the scalar.min / scalar.max clamp bounds
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+  // Clamp bounds do not change inside the loops, so they are loaded once.
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    const float* i9 = input[9];
+    const float* i10 = input[10];
+    const float* i11 = input[11];
+    const float* i12 = input[12];
+    const float* i13 = input[13];
+    const float* i14 = input[14];
+    const float* i15 = input[15];
+    const float* i16 = input[16];
+    const float* i17 = input[17];
+    const float* i18 = input[18];
+    const float* i19 = input[19];
+    const float* i20 = input[20];
+    const float* i21 = input[21];
+    const float* i22 = input[22];
+    const float* i23 = input[23];
+    const float* i24 = input[24];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 2; c -= 2) {  // main loop: two channels per iteration
+      float vacc0p0 = w[0];
+      float vacc1p0 = w[1];
+      // Weight group layout (52 floats): w[0..1] seed the two accumulators (presumably biases),
+      // then w[2+2k] / w[3+2k] are tap k's weights for the first / second channel of the pair.
+      const float vi0x0 = i0[0];
+      const float vi0x1 = i0[1];
+      i0 += 2;
+
+      const float vk0x0 = w[2];
+      vacc0p0 += vi0x0 * vk0x0;
+      const float vk0x1 = w[3];
+      vacc1p0 += vi0x1 * vk0x1;
+
+      const float vi1x0 = i1[0];
+      const float vi1x1 = i1[1];
+      i1 += 2;
+
+      const float vk1x0 = w[4];
+      vacc0p0 += vi1x0 * vk1x0;
+      const float vk1x1 = w[5];
+      vacc1p0 += vi1x1 * vk1x1;
+
+      const float vi2x0 = i2[0];
+      const float vi2x1 = i2[1];
+      i2 += 2;
+
+      const float vk2x0 = w[6];
+      vacc0p0 += vi2x0 * vk2x0;
+      const float vk2x1 = w[7];
+      vacc1p0 += vi2x1 * vk2x1;
+
+      const float vi3x0 = i3[0];
+      const float vi3x1 = i3[1];
+      i3 += 2;
+
+      const float vk3x0 = w[8];
+      vacc0p0 += vi3x0 * vk3x0;
+      const float vk3x1 = w[9];
+      vacc1p0 += vi3x1 * vk3x1;
+
+      const float vi4x0 = i4[0];
+      const float vi4x1 = i4[1];
+      i4 += 2;
+
+      const float vk4x0 = w[10];
+      vacc0p0 += vi4x0 * vk4x0;
+      const float vk4x1 = w[11];
+      vacc1p0 += vi4x1 * vk4x1;
+
+      const float vi5x0 = i5[0];
+      const float vi5x1 = i5[1];
+      i5 += 2;
+
+      const float vk5x0 = w[12];
+      vacc0p0 += vi5x0 * vk5x0;
+      const float vk5x1 = w[13];
+      vacc1p0 += vi5x1 * vk5x1;
+
+      const float vi6x0 = i6[0];
+      const float vi6x1 = i6[1];
+      i6 += 2;
+
+      const float vk6x0 = w[14];
+      vacc0p0 += vi6x0 * vk6x0;
+      const float vk6x1 = w[15];
+      vacc1p0 += vi6x1 * vk6x1;
+
+      const float vi7x0 = i7[0];
+      const float vi7x1 = i7[1];
+      i7 += 2;
+
+      const float vk7x0 = w[16];
+      vacc0p0 += vi7x0 * vk7x0;
+      const float vk7x1 = w[17];
+      vacc1p0 += vi7x1 * vk7x1;
+
+      const float vi8x0 = i8[0];
+      const float vi8x1 = i8[1];
+      i8 += 2;
+
+      const float vk8x0 = w[18];
+      vacc0p0 += vi8x0 * vk8x0;
+      const float vk8x1 = w[19];
+      vacc1p0 += vi8x1 * vk8x1;
+
+      const float vi9x0 = i9[0];
+      const float vi9x1 = i9[1];
+      i9 += 2;
+
+      const float vk9x0 = w[20];
+      vacc0p0 += vi9x0 * vk9x0;
+      const float vk9x1 = w[21];
+      vacc1p0 += vi9x1 * vk9x1;
+
+      const float vi10x0 = i10[0];
+      const float vi10x1 = i10[1];
+      i10 += 2;
+
+      const float vk10x0 = w[22];
+      vacc0p0 += vi10x0 * vk10x0;
+      const float vk10x1 = w[23];
+      vacc1p0 += vi10x1 * vk10x1;
+
+      const float vi11x0 = i11[0];
+      const float vi11x1 = i11[1];
+      i11 += 2;
+
+      const float vk11x0 = w[24];
+      vacc0p0 += vi11x0 * vk11x0;
+      const float vk11x1 = w[25];
+      vacc1p0 += vi11x1 * vk11x1;
+
+      const float vi12x0 = i12[0];
+      const float vi12x1 = i12[1];
+      i12 += 2;
+
+      const float vk12x0 = w[26];
+      vacc0p0 += vi12x0 * vk12x0;
+      const float vk12x1 = w[27];
+      vacc1p0 += vi12x1 * vk12x1;
+
+      const float vi13x0 = i13[0];
+      const float vi13x1 = i13[1];
+      i13 += 2;
+
+      const float vk13x0 = w[28];
+      vacc0p0 += vi13x0 * vk13x0;
+      const float vk13x1 = w[29];
+      vacc1p0 += vi13x1 * vk13x1;
+
+      const float vi14x0 = i14[0];
+      const float vi14x1 = i14[1];
+      i14 += 2;
+
+      const float vk14x0 = w[30];
+      vacc0p0 += vi14x0 * vk14x0;
+      const float vk14x1 = w[31];
+      vacc1p0 += vi14x1 * vk14x1;
+
+      const float vi15x0 = i15[0];
+      const float vi15x1 = i15[1];
+      i15 += 2;
+
+      const float vk15x0 = w[32];
+      vacc0p0 += vi15x0 * vk15x0;
+      const float vk15x1 = w[33];
+      vacc1p0 += vi15x1 * vk15x1;
+
+      const float vi16x0 = i16[0];
+      const float vi16x1 = i16[1];
+      i16 += 2;
+
+      const float vk16x0 = w[34];
+      vacc0p0 += vi16x0 * vk16x0;
+      const float vk16x1 = w[35];
+      vacc1p0 += vi16x1 * vk16x1;
+
+      const float vi17x0 = i17[0];
+      const float vi17x1 = i17[1];
+      i17 += 2;
+
+      const float vk17x0 = w[36];
+      vacc0p0 += vi17x0 * vk17x0;
+      const float vk17x1 = w[37];
+      vacc1p0 += vi17x1 * vk17x1;
+
+      const float vi18x0 = i18[0];
+      const float vi18x1 = i18[1];
+      i18 += 2;
+
+      const float vk18x0 = w[38];
+      vacc0p0 += vi18x0 * vk18x0;
+      const float vk18x1 = w[39];
+      vacc1p0 += vi18x1 * vk18x1;
+
+      const float vi19x0 = i19[0];
+      const float vi19x1 = i19[1];
+      i19 += 2;
+
+      const float vk19x0 = w[40];
+      vacc0p0 += vi19x0 * vk19x0;
+      const float vk19x1 = w[41];
+      vacc1p0 += vi19x1 * vk19x1;
+
+      const float vi20x0 = i20[0];
+      const float vi20x1 = i20[1];
+      i20 += 2;
+
+      const float vk20x0 = w[42];
+      vacc0p0 += vi20x0 * vk20x0;
+      const float vk20x1 = w[43];
+      vacc1p0 += vi20x1 * vk20x1;
+
+      const float vi21x0 = i21[0];
+      const float vi21x1 = i21[1];
+      i21 += 2;
+
+      const float vk21x0 = w[44];
+      vacc0p0 += vi21x0 * vk21x0;
+      const float vk21x1 = w[45];
+      vacc1p0 += vi21x1 * vk21x1;
+
+      const float vi22x0 = i22[0];
+      const float vi22x1 = i22[1];
+      i22 += 2;
+
+      const float vk22x0 = w[46];
+      vacc0p0 += vi22x0 * vk22x0;
+      const float vk22x1 = w[47];
+      vacc1p0 += vi22x1 * vk22x1;
+
+      const float vi23x0 = i23[0];
+      const float vi23x1 = i23[1];
+      i23 += 2;
+
+      const float vk23x0 = w[48];
+      vacc0p0 += vi23x0 * vk23x0;
+      const float vk23x1 = w[49];
+      vacc1p0 += vi23x1 * vk23x1;
+
+      const float vi24x0 = i24[0];
+      const float vi24x1 = i24[1];
+      i24 += 2;
+
+      const float vk24x0 = w[50];
+      vacc0p0 += vi24x0 * vk24x0;
+      const float vk24x1 = w[51];
+      vacc1p0 += vi24x1 * vk24x1;
+
+      w += 52;
+
+      // Clamp to [vmin, vmax]: max against the lower bound first, then min against the upper bound.
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);
+      float vacc1 = __builtin_wasm_max_f32(vacc1p0, vmin);
+
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+      vacc1 = __builtin_wasm_min_f32(vacc1, vmax);
+
+      output[0] = vacc0;
+      output[1] = vacc1;
+      output += 2;
+    }
+    for (; c >= 1; c -= 1) {  // remainder: executes at most once, when `channels` is odd
+      float vacc0p0 = *w++;  // w now sits one past the group start, so w[2k+1] below is tap k's first-channel weight
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+      const float vi1 = *i1++;
+      const float vk1 = w[3];
+      vacc0p0 += vi1 * vk1;
+      const float vi2 = *i2++;
+      const float vk2 = w[5];
+      vacc0p0 += vi2 * vk2;
+      const float vi3 = *i3++;
+      const float vk3 = w[7];
+      vacc0p0 += vi3 * vk3;
+      const float vi4 = *i4++;
+      const float vk4 = w[9];
+      vacc0p0 += vi4 * vk4;
+      const float vi5 = *i5++;
+      const float vk5 = w[11];
+      vacc0p0 += vi5 * vk5;
+      const float vi6 = *i6++;
+      const float vk6 = w[13];
+      vacc0p0 += vi6 * vk6;
+      const float vi7 = *i7++;
+      const float vk7 = w[15];
+      vacc0p0 += vi7 * vk7;
+      const float vi8 = *i8++;
+      const float vk8 = w[17];
+      vacc0p0 += vi8 * vk8;
+      const float vi9 = *i9++;
+      const float vk9 = w[19];
+      vacc0p0 += vi9 * vk9;
+      const float vi10 = *i10++;
+      const float vk10 = w[21];
+      vacc0p0 += vi10 * vk10;
+      const float vi11 = *i11++;
+      const float vk11 = w[23];
+      vacc0p0 += vi11 * vk11;
+      const float vi12 = *i12++;
+      const float vk12 = w[25];
+      vacc0p0 += vi12 * vk12;
+      const float vi13 = *i13++;
+      const float vk13 = w[27];
+      vacc0p0 += vi13 * vk13;
+      const float vi14 = *i14++;
+      const float vk14 = w[29];
+      vacc0p0 += vi14 * vk14;
+      const float vi15 = *i15++;
+      const float vk15 = w[31];
+      vacc0p0 += vi15 * vk15;
+      const float vi16 = *i16++;
+      const float vk16 = w[33];
+      vacc0p0 += vi16 * vk16;
+      const float vi17 = *i17++;
+      const float vk17 = w[35];
+      vacc0p0 += vi17 * vk17;
+      const float vi18 = *i18++;
+      const float vk18 = w[37];
+      vacc0p0 += vi18 * vk18;
+      const float vi19 = *i19++;
+      const float vk19 = w[39];
+      vacc0p0 += vi19 * vk19;
+      const float vi20 = *i20++;
+      const float vk20 = w[41];
+      vacc0p0 += vi20 * vk20;
+      const float vi21 = *i21++;
+      const float vk21 = w[43];
+      vacc0p0 += vi21 * vk21;
+      const float vi22 = *i22++;
+      const float vk22 = w[45];
+      vacc0p0 += vi22 * vk22;
+      const float vi23 = *i23++;
+      const float vk23 = w[47];
+      vacc0p0 += vi23 * vk23;
+      const float vi24 = *i24++;
+      const float vk24 = w[49];
+      vacc0p0 += vi24 * vk24;
+
+      // Same max-then-min clamp as the main loop.
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+      *output++ = vacc0;
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);  // increment is applied in bytes, on top of the floats just written
+  } while (--output_width != 0);
+}

diff --git a/src/f32-dwconv/gen/up2x4-wasm-acc2.c b/src/f32-dwconv/gen/up2x4-wasm-acc2.c
new file mode 100644
index 0000000..113f270
--- /dev/null
+++ b/src/f32-dwconv/gen/up2x4-wasm-acc2.c

@@ -0,0 +1,123 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up2x4__wasm_acc2(  // per channel: weight-seeded sum of 4 input*weight products, clamped to [min, max]
+    size_t channels,  // floats consumed from each input pointer per outer iteration; must be non-zero
+    size_t output_width,  // number of outer-loop iterations; must be non-zero
+    const float** input,  // 4 pointers are read per outer iteration, then the array is advanced by input_stride
+    const float* weights,  // re-read from the start on every outer iteration; layout described in the main loop
+    float* output,  // receives `channels` floats per outer iteration
+    size_t input_stride,  // byte offset added to `input` after each outer iteration
+    size_t output_increment,  // byte offset added to `output` after each outer iteration's channels are written
+    const union xnn_f32_output_params params[restrict static 1])  // supplies the scalar.min / scalar.max clamp bounds
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+  // Clamp bounds do not change inside the loops, so they are loaded once.
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 2; c -= 2) {  // main loop: two channels per iteration
+      float vacc0p0 = w[0];
+      float vacc1p0 = w[1];
+      // Weight group layout (10 floats): w[0..1] seed the two accumulators (presumably biases),
+      // then w[2+2k] / w[3+2k] are tap k's weights for the first / second channel of the pair.
+      const float vi0x0 = i0[0];
+      const float vi0x1 = i0[1];
+      i0 += 2;
+
+      const float vk0x0 = w[2];
+      vacc0p0 += vi0x0 * vk0x0;
+      const float vk0x1 = w[3];
+      vacc1p0 += vi0x1 * vk0x1;
+
+      const float vi1x0 = i1[0];
+      const float vi1x1 = i1[1];
+      i1 += 2;
+
+      const float vk1x0 = w[4];
+      float vacc0p1 = vi1x0 * vk1x0;  // second partial sum starts at tap 1; odd taps accumulate here, even taps in p0
+      const float vk1x1 = w[5];
+      float vacc1p1 = vi1x1 * vk1x1;
+
+      const float vi2x0 = i2[0];
+      const float vi2x1 = i2[1];
+      i2 += 2;
+
+      const float vk2x0 = w[6];
+      vacc0p0 += vi2x0 * vk2x0;
+      const float vk2x1 = w[7];
+      vacc1p0 += vi2x1 * vk2x1;
+
+      const float vi3x0 = i3[0];
+      const float vi3x1 = i3[1];
+      i3 += 2;
+
+      const float vk3x0 = w[8];
+      vacc0p1 += vi3x0 * vk3x0;
+      const float vk3x1 = w[9];
+      vacc1p1 += vi3x1 * vk3x1;
+
+      w += 10;
+
+      // Fold each channel's two partial sums into vacc0p0 / vacc1p0
+      vacc0p0 = vacc0p0 + vacc0p1;
+      vacc1p0 = vacc1p0 + vacc1p1;
+      // Clamp to [vmin, vmax]: max against the lower bound first, then min against the upper bound.
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);
+      float vacc1 = __builtin_wasm_max_f32(vacc1p0, vmin);
+
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+      vacc1 = __builtin_wasm_min_f32(vacc1, vmax);
+
+      output[0] = vacc0;
+      output[1] = vacc1;
+      output += 2;
+    }
+    for (; c >= 1; c -= 1) {  // remainder: executes at most once, when `channels` is odd
+      float vacc0p0 = *w++;  // w now sits one past the group start, so w[2k+1] below is tap k's first-channel weight
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+      const float vi1 = *i1++;
+      const float vk1 = w[3];
+      float vacc0p1 = vi1 * vk1;
+      const float vi2 = *i2++;
+      const float vk2 = w[5];
+      vacc0p0 += vi2 * vk2;
+      const float vi3 = *i3++;
+      const float vk3 = w[7];
+      vacc0p1 += vi3 * vk3;
+
+      // Fold the second partial sum into vacc0p0
+      vacc0p0 = vacc0p0 + vacc0p1;
+
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+      *output++ = vacc0;
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);  // increment is applied in bytes, on top of the floats just written
+  } while (--output_width != 0);
+}

diff --git a/src/f32-dwconv/gen/up2x4-wasm.c b/src/f32-dwconv/gen/up2x4-wasm.c
new file mode 100644
index 0000000..b1fa49a
--- /dev/null
+++ b/src/f32-dwconv/gen/up2x4-wasm.c

@@ -0,0 +1,118 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up2x4__wasm(  // per channel: weight-seeded sum of 4 input*weight products, clamped to [min, max]
+    size_t channels,  // floats consumed from each input pointer per outer iteration; must be non-zero
+    size_t output_width,  // number of outer-loop iterations; must be non-zero
+    const float** input,  // 4 pointers are read per outer iteration, then the array is advanced by input_stride
+    const float* weights,  // re-read from the start on every outer iteration; layout described in the main loop
+    float* output,  // receives `channels` floats per outer iteration
+    size_t input_stride,  // byte offset added to `input` after each outer iteration
+    size_t output_increment,  // byte offset added to `output` after each outer iteration's channels are written
+    const union xnn_f32_output_params params[restrict static 1])  // supplies the scalar.min / scalar.max clamp bounds
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+  // Clamp bounds do not change inside the loops, so they are loaded once.
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 2; c -= 2) {  // main loop: two channels per iteration
+      float vacc0p0 = w[0];
+      float vacc1p0 = w[1];
+      // Weight group layout (10 floats): w[0..1] seed the two accumulators (presumably biases),
+      // then w[2+2k] / w[3+2k] are tap k's weights for the first / second channel of the pair.
+      const float vi0x0 = i0[0];
+      const float vi0x1 = i0[1];
+      i0 += 2;
+
+      const float vk0x0 = w[2];
+      vacc0p0 += vi0x0 * vk0x0;
+      const float vk0x1 = w[3];
+      vacc1p0 += vi0x1 * vk0x1;
+
+      const float vi1x0 = i1[0];
+      const float vi1x1 = i1[1];
+      i1 += 2;
+
+      const float vk1x0 = w[4];
+      vacc0p0 += vi1x0 * vk1x0;
+      const float vk1x1 = w[5];
+      vacc1p0 += vi1x1 * vk1x1;
+
+      const float vi2x0 = i2[0];
+      const float vi2x1 = i2[1];
+      i2 += 2;
+
+      const float vk2x0 = w[6];
+      vacc0p0 += vi2x0 * vk2x0;
+      const float vk2x1 = w[7];
+      vacc1p0 += vi2x1 * vk2x1;
+
+      const float vi3x0 = i3[0];
+      const float vi3x1 = i3[1];
+      i3 += 2;
+
+      const float vk3x0 = w[8];
+      vacc0p0 += vi3x0 * vk3x0;
+      const float vk3x1 = w[9];
+      vacc1p0 += vi3x1 * vk3x1;
+
+      w += 10;
+
+      // Clamp to [vmin, vmax]: max against the lower bound first, then min against the upper bound.
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);
+      float vacc1 = __builtin_wasm_max_f32(vacc1p0, vmin);
+
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+      vacc1 = __builtin_wasm_min_f32(vacc1, vmax);
+
+      output[0] = vacc0;
+      output[1] = vacc1;
+      output += 2;
+    }
+    for (; c >= 1; c -= 1) {  // remainder: executes at most once, when `channels` is odd
+      float vacc0p0 = *w++;  // w now sits one past the group start, so w[2k+1] below is tap k's first-channel weight
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+      const float vi1 = *i1++;
+      const float vk1 = w[3];
+      vacc0p0 += vi1 * vk1;
+      const float vi2 = *i2++;
+      const float vk2 = w[5];
+      vacc0p0 += vi2 * vk2;
+      const float vi3 = *i3++;
+      const float vk3 = w[7];
+      vacc0p0 += vi3 * vk3;
+
+      // Same max-then-min clamp as the main loop.
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+      *output++ = vacc0;
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);  // increment is applied in bytes, on top of the floats just written
+  } while (--output_width != 0);
+}

diff --git a/src/f32-dwconv/gen/up2x9-wasm-acc2.c b/src/f32-dwconv/gen/up2x9-wasm-acc2.c
new file mode 100644
index 0000000..6411ea6
--- /dev/null
+++ b/src/f32-dwconv/gen/up2x9-wasm-acc2.c

@@ -0,0 +1,188 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up2x9__wasm_acc2(  // per channel: weight-seeded sum of 9 input*weight products, clamped to [min, max]
+    size_t channels,  // floats consumed from each input pointer per outer iteration; must be non-zero
+    size_t output_width,  // number of outer-loop iterations; must be non-zero
+    const float** input,  // 9 pointers are read per outer iteration, then the array is advanced by input_stride
+    const float* weights,  // re-read from the start on every outer iteration; layout described in the main loop
+    float* output,  // receives `channels` floats per outer iteration
+    size_t input_stride,  // byte offset added to `input` after each outer iteration
+    size_t output_increment,  // byte offset added to `output` after each outer iteration's channels are written
+    const union xnn_f32_output_params params[restrict static 1])  // supplies the scalar.min / scalar.max clamp bounds
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+  // Clamp bounds do not change inside the loops, so they are loaded once.
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 2; c -= 2) {  // main loop: two channels per iteration
+      float vacc0p0 = w[0];
+      float vacc1p0 = w[1];
+      // Weight group layout (20 floats): w[0..1] seed the two accumulators (presumably biases),
+      // then w[2+2k] / w[3+2k] are tap k's weights for the first / second channel of the pair.
+      const float vi0x0 = i0[0];
+      const float vi0x1 = i0[1];
+      i0 += 2;
+
+      const float vk0x0 = w[2];
+      vacc0p0 += vi0x0 * vk0x0;
+      const float vk0x1 = w[3];
+      vacc1p0 += vi0x1 * vk0x1;
+
+      const float vi1x0 = i1[0];
+      const float vi1x1 = i1[1];
+      i1 += 2;
+
+      const float vk1x0 = w[4];
+      float vacc0p1 = vi1x0 * vk1x0;  // second partial sum starts at tap 1; odd taps accumulate here, even taps in p0
+      const float vk1x1 = w[5];
+      float vacc1p1 = vi1x1 * vk1x1;
+
+      const float vi2x0 = i2[0];
+      const float vi2x1 = i2[1];
+      i2 += 2;
+
+      const float vk2x0 = w[6];
+      vacc0p0 += vi2x0 * vk2x0;
+      const float vk2x1 = w[7];
+      vacc1p0 += vi2x1 * vk2x1;
+
+      const float vi3x0 = i3[0];
+      const float vi3x1 = i3[1];
+      i3 += 2;
+
+      const float vk3x0 = w[8];
+      vacc0p1 += vi3x0 * vk3x0;
+      const float vk3x1 = w[9];
+      vacc1p1 += vi3x1 * vk3x1;
+
+      const float vi4x0 = i4[0];
+      const float vi4x1 = i4[1];
+      i4 += 2;
+
+      const float vk4x0 = w[10];
+      vacc0p0 += vi4x0 * vk4x0;
+      const float vk4x1 = w[11];
+      vacc1p0 += vi4x1 * vk4x1;
+
+      const float vi5x0 = i5[0];
+      const float vi5x1 = i5[1];
+      i5 += 2;
+
+      const float vk5x0 = w[12];
+      vacc0p1 += vi5x0 * vk5x0;
+      const float vk5x1 = w[13];
+      vacc1p1 += vi5x1 * vk5x1;
+
+      const float vi6x0 = i6[0];
+      const float vi6x1 = i6[1];
+      i6 += 2;
+
+      const float vk6x0 = w[14];
+      vacc0p0 += vi6x0 * vk6x0;
+      const float vk6x1 = w[15];
+      vacc1p0 += vi6x1 * vk6x1;
+
+      const float vi7x0 = i7[0];
+      const float vi7x1 = i7[1];
+      i7 += 2;
+
+      const float vk7x0 = w[16];
+      vacc0p1 += vi7x0 * vk7x0;
+      const float vk7x1 = w[17];
+      vacc1p1 += vi7x1 * vk7x1;
+
+      const float vi8x0 = i8[0];
+      const float vi8x1 = i8[1];
+      i8 += 2;
+
+      const float vk8x0 = w[18];
+      vacc0p0 += vi8x0 * vk8x0;
+      const float vk8x1 = w[19];
+      vacc1p0 += vi8x1 * vk8x1;
+
+      w += 20;
+
+      // Fold each channel's two partial sums into vacc0p0 / vacc1p0
+      vacc0p0 = vacc0p0 + vacc0p1;
+      vacc1p0 = vacc1p0 + vacc1p1;
+      // Clamp to [vmin, vmax]: max against the lower bound first, then min against the upper bound.
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);
+      float vacc1 = __builtin_wasm_max_f32(vacc1p0, vmin);
+
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+      vacc1 = __builtin_wasm_min_f32(vacc1, vmax);
+
+      output[0] = vacc0;
+      output[1] = vacc1;
+      output += 2;
+    }
+    for (; c >= 1; c -= 1) {  // remainder: executes at most once, when `channels` is odd
+      float vacc0p0 = *w++;  // w now sits one past the group start, so w[2k+1] below is tap k's first-channel weight
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];
+      vacc0p0 += vi0 * vk0;
+      const float vi1 = *i1++;
+      const float vk1 = w[3];
+      float vacc0p1 = vi1 * vk1;
+      const float vi2 = *i2++;
+      const float vk2 = w[5];
+      vacc0p0 += vi2 * vk2;
+      const float vi3 = *i3++;
+      const float vk3 = w[7];
+      vacc0p1 += vi3 * vk3;
+      const float vi4 = *i4++;
+      const float vk4 = w[9];
+      vacc0p0 += vi4 * vk4;
+      const float vi5 = *i5++;
+      const float vk5 = w[11];
+      vacc0p1 += vi5 * vk5;
+      const float vi6 = *i6++;
+      const float vk6 = w[13];
+      vacc0p0 += vi6 * vk6;
+      const float vi7 = *i7++;
+      const float vk7 = w[15];
+      vacc0p1 += vi7 * vk7;
+      const float vi8 = *i8++;
+      const float vk8 = w[17];
+      vacc0p0 += vi8 * vk8;
+
+      // Fold the second partial sum into vacc0p0
+      vacc0p0 = vacc0p0 + vacc0p1;
+
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+      *output++ = vacc0;
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);  // increment is applied in bytes, on top of the floats just written
+  } while (--output_width != 0);
+}

diff --git a/src/f32-dwconv/gen/up2x9-wasm.c b/src/f32-dwconv/gen/up2x9-wasm.c
new file mode 100644
index 0000000..d2d71b8
--- /dev/null
+++ b/src/f32-dwconv/gen/up2x9-wasm.c

@@ -0,0 +1,183 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-dwconv/up-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/dwconv.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_dwconv_ukernel_up2x9__wasm(  // f32 dwconv micro-kernel: 9 multiply-accumulate taps per channel, 2 channels per main-loop step, WAsm min/max clamp
+    size_t channels,  // channels processed per outer iteration (asserted non-zero)
+    size_t output_width,  // number of outer-loop iterations (asserted non-zero)
+    const float** input,  // table of input pointers; entries [0..8] are read each iteration
+    const float* weights,  // packed initial values and tap weights; re-read from the start every iteration
+    float* output,  // destination; `channels` floats are written per outer iteration
+    size_t input_stride,  // bytes added to `input` after each outer iteration
+    size_t output_increment,  // bytes added to `output` after each iteration's channels are written
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min and scalar.max
+{
+  assert(channels != 0);
+  assert(output_width != 0);
+
+  const float vmin = params->scalar.min;  // clamp bounds are loaded once and reused for every output
+  const float vmax = params->scalar.max;
+  do {
+    const float* i0 = input[0];  // nine input pointers, one per tap below
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_stride);  // byte-granular advance of the pointer table
+
+    size_t c = channels;
+    const float* w = weights;
+    for (; c >= 2; c -= 2) {  // main loop: two channels per iteration
+      float vacc0p0 = w[0];  // accumulators start from w[0], w[1] (presumably per-channel bias -- confirm against the weight packing code)
+      float vacc1p0 = w[1];
+
+
+      const float vi0x0 = i0[0];  // tap 0: inputs for channel 0 and channel 1
+      const float vi0x1 = i0[1];
+      i0 += 2;
+
+      const float vk0x0 = w[2];  // tap weights are interleaved per channel: w[2 + 2*tap + channel]
+      vacc0p0 += vi0x0 * vk0x0;
+      const float vk0x1 = w[3];
+      vacc1p0 += vi0x1 * vk0x1;
+
+      const float vi1x0 = i1[0];  // tap 1
+      const float vi1x1 = i1[1];
+      i1 += 2;
+
+      const float vk1x0 = w[4];
+      vacc0p0 += vi1x0 * vk1x0;
+      const float vk1x1 = w[5];
+      vacc1p0 += vi1x1 * vk1x1;
+
+      const float vi2x0 = i2[0];  // tap 2
+      const float vi2x1 = i2[1];
+      i2 += 2;
+
+      const float vk2x0 = w[6];
+      vacc0p0 += vi2x0 * vk2x0;
+      const float vk2x1 = w[7];
+      vacc1p0 += vi2x1 * vk2x1;
+
+      const float vi3x0 = i3[0];  // tap 3
+      const float vi3x1 = i3[1];
+      i3 += 2;
+
+      const float vk3x0 = w[8];
+      vacc0p0 += vi3x0 * vk3x0;
+      const float vk3x1 = w[9];
+      vacc1p0 += vi3x1 * vk3x1;
+
+      const float vi4x0 = i4[0];  // tap 4
+      const float vi4x1 = i4[1];
+      i4 += 2;
+
+      const float vk4x0 = w[10];
+      vacc0p0 += vi4x0 * vk4x0;
+      const float vk4x1 = w[11];
+      vacc1p0 += vi4x1 * vk4x1;
+
+      const float vi5x0 = i5[0];  // tap 5
+      const float vi5x1 = i5[1];
+      i5 += 2;
+
+      const float vk5x0 = w[12];
+      vacc0p0 += vi5x0 * vk5x0;
+      const float vk5x1 = w[13];
+      vacc1p0 += vi5x1 * vk5x1;
+
+      const float vi6x0 = i6[0];  // tap 6
+      const float vi6x1 = i6[1];
+      i6 += 2;
+
+      const float vk6x0 = w[14];
+      vacc0p0 += vi6x0 * vk6x0;
+      const float vk6x1 = w[15];
+      vacc1p0 += vi6x1 * vk6x1;
+
+      const float vi7x0 = i7[0];  // tap 7
+      const float vi7x1 = i7[1];
+      i7 += 2;
+
+      const float vk7x0 = w[16];
+      vacc0p0 += vi7x0 * vk7x0;
+      const float vk7x1 = w[17];
+      vacc1p0 += vi7x1 * vk7x1;
+
+      const float vi8x0 = i8[0];  // tap 8
+      const float vi8x1 = i8[1];
+      i8 += 2;
+
+      const float vk8x0 = w[18];
+      vacc0p0 += vi8x0 * vk8x0;
+      const float vk8x1 = w[19];
+      vacc1p0 += vi8x1 * vk8x1;
+
+      w += 20;  // 2 initial values + 9 taps x 2 channels consumed
+
+
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);  // lower clamp first
+      float vacc1 = __builtin_wasm_max_f32(vacc1p0, vmin);
+
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);  // then upper clamp
+      vacc1 = __builtin_wasm_min_f32(vacc1, vmax);
+
+      output[0] = vacc0;
+      output[1] = vacc1;
+      output += 2;
+    }
+    for (; c >= 1; c -= 1) {  // remainder: c is 0 or 1 here, so this body runs at most once
+      float vacc0p0 = *w++;  // initial value for the leftover channel; w now points one float further
+
+      const float vi0 = *i0++;
+      const float vk0 = w[1];  // after the w++ above, w[1], w[3], ..., w[17] are the slots the main loop reads as w[2], w[4], ..., w[18]
+      vacc0p0 += vi0 * vk0;
+      const float vi1 = *i1++;
+      const float vk1 = w[3];
+      vacc0p0 += vi1 * vk1;
+      const float vi2 = *i2++;
+      const float vk2 = w[5];
+      vacc0p0 += vi2 * vk2;
+      const float vi3 = *i3++;
+      const float vk3 = w[7];
+      vacc0p0 += vi3 * vk3;
+      const float vi4 = *i4++;
+      const float vk4 = w[9];
+      vacc0p0 += vi4 * vk4;
+      const float vi5 = *i5++;
+      const float vk5 = w[11];
+      vacc0p0 += vi5 * vk5;
+      const float vi6 = *i6++;
+      const float vk6 = w[13];
+      vacc0p0 += vi6 * vk6;
+      const float vi7 = *i7++;
+      const float vk7 = w[15];
+      vacc0p0 += vi7 * vk7;
+      const float vi8 = *i8++;
+      const float vk8 = w[17];
+      vacc0p0 += vi8 * vk8;
+
+
+      float vacc0 = __builtin_wasm_max_f32(vacc0p0, vmin);  // same clamp order as the main loop: max with vmin, then min with vmax
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+      *output++ = vacc0;
+    }
+
+    output = (float*) ((uintptr_t) output + output_increment);  // skip output_increment bytes before the next outer iteration
+  } while (--output_width != 0);
+}

diff --git a/src/f32-dwconv/up-scalar.c.in b/src/f32-dwconv/up-scalar.c.in
index a789bbb..0c09c9b 100644
--- a/src/f32-dwconv/up-scalar.c.in
+++ b/src/f32-dwconv/up-scalar.c.in

@@ -13,7 +13,9 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_dwconv_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__scalar${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
+$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32"
+$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32"
+void xnn_f32_dwconv_ukernel_up${CHANNEL_TILE}x${KERNEL_TILE}__${"wasm" if WASM else "scalar"}${"" if ACCUMULATORS == 1 else "_acc%d" % ACCUMULATORS}(
     size_t channels,
     size_t output_width,
     const float** input,
@@ -66,10 +68,10 @@
             $ACC_SLICE *= 2
 
         $for C in range(CHANNEL_TILE):
-          float vacc${C} = math_max_f32(vacc${C}p0, vmin);
+          float vacc${C} = ${MAX_F32}(vacc${C}p0, vmin);
 
         $for C in range(CHANNEL_TILE):
-          vacc${C} = math_min_f32(vacc${C}, vmax);
+          vacc${C} = ${MIN_F32}(vacc${C}, vmax);
 
         $for C in range(CHANNEL_TILE):
           output[${C}] = vacc${C};
@@ -95,8 +97,8 @@
                 vacc0p${A} = vacc0p${A} + vacc0p${A + ACC_SLICE};
             $ACC_SLICE *= 2
 
-        float vacc0 = math_max_f32(vacc0p0, vmin);
-        vacc0 = math_min_f32(vacc0, vmax);
+        float vacc0 = ${MAX_F32}(vacc0p0, vmin);
+        vacc0 = ${MIN_F32}(vacc0, vmax);
         *output++ = vacc0;
       }
     $else:
@@ -120,8 +122,8 @@
               vacc0p${A} += vacc0p${A + ACC_STEP};
           $ACC_STEP *= 2
 
-        float vacc0 = math_max_f32(vacc0p0, vmin);
-        vacc0 = math_min_f32(vacc0, vmax);
+        float vacc0 = ${MAX_F32}(vacc0p0, vmin);
+        vacc0 = ${MIN_F32}(vacc0, vmax);
 
         *output++ = vacc0;
       } while (--c != 0);

diff --git a/src/f32-gavgpool/mp7p7q-wasm.c b/src/f32-gavgpool/mp7p7q-wasm.c
new file mode 100644
index 0000000..d4fa565
--- /dev/null
+++ b/src/f32-gavgpool/mp7p7q-wasm.c

@@ -0,0 +1,148 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_gavgpool_ukernel_mp7p7q__wasm(  // multi-pass column sum over m rows (m > 7) of n floats, 7 rows per pass, then scale and clamp
+    size_t m,  // number of input rows (asserted > 7)
+    size_t n,  // floats per row (asserted non-zero); also consumed as the final loop counter
+    const float* input,  // first row
+    size_t input_stride,  // bytes between consecutive rows
+    const float* zero,  // substituted for rows past the last valid one in the final pass (presumably >= n zero floats -- confirm with caller)
+    float* buffer,  // scratch holding n running sums between passes
+    float* output,  // n results are written here
+    const union xnn_f32_avgpool_params params[restrict static 1])  // supplies scalar.multiplier, scalar.output_min, scalar.output_max
+{
+  assert(m > 7);
+  assert(n != 0);
+
+  const float* i0 = input;  // first pass reads rows 0..6, each input_stride bytes after the previous
+  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
+  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);
+  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
+  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
+  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
+  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
+  const size_t input_increment = 7 * input_stride - n * sizeof(float);  // each pass advances a pointer n floats; adding this lands it 7 rows below where the pass started
+
+  float* b = buffer;
+  size_t k = n;
+  do {
+    const float vi0 = *i0++;
+    const float vi1 = *i1++;
+    const float vi2 = *i2++;
+    const float vi3 = *i3++;
+    const float vi4 = *i4++;
+    const float vi5 = *i5++;
+    const float vi6 = *i6++;
+
+    const float vsum01 = vi0 + vi1;  // pairwise tree of additions over the 7 rows
+    const float vsum23 = vi2 + vi3;
+    const float vsum45 = vi4 + vi5;
+
+    const float vsum016 = vsum01 + vi6;
+    const float vsum2345 = vsum23 + vsum45;
+
+    const float vsum = vsum016 + vsum2345;
+
+    *b++ = vsum;  // first pass overwrites the buffer; nothing is read from it yet
+  } while (--k != 0);
+  for (m -= 7; m > 7; m -= 7) {  // intermediate passes: 7 more rows each, while more than 7 rows remain
+    b = buffer;
+
+    i0 = (const float*) ((uintptr_t) i0 + input_increment);  // move every row pointer to the next group of 7 rows
+    i1 = (const float*) ((uintptr_t) i1 + input_increment);
+    i2 = (const float*) ((uintptr_t) i2 + input_increment);
+    i3 = (const float*) ((uintptr_t) i3 + input_increment);
+    i4 = (const float*) ((uintptr_t) i4 + input_increment);
+    i5 = (const float*) ((uintptr_t) i5 + input_increment);
+    i6 = (const float*) ((uintptr_t) i6 + input_increment);
+
+    size_t k = n;  // shadows the function-scope k declared before the first pass
+    do {
+      const float vi0 = *i0++;
+      const float vi1 = *i1++;
+      const float vi2 = *i2++;
+      const float vi3 = *i3++;
+      const float vi4 = *i4++;
+      const float vi5 = *i5++;
+      const float vi6 = *i6++;
+      const float vacc = *b;  // running sum from the earlier passes
+
+      const float vsum01 = vi0 + vi1;
+      const float vsum23 = vi2 + vi3;
+      const float vsum45 = vi4 + vi5;
+      const float vsum6a = vi6 + vacc;  // the running sum joins the tree as an eighth operand
+
+      const float vsum0123 = vsum01 + vsum23;
+      const float vsum456a = vsum45 + vsum6a;
+
+      const float vsum = vsum0123 + vsum456a;
+
+      *b++ = vsum;  // write the updated running sum back in place
+    } while (--k != 0);
+  }
+
+  i0 = (const float*) ((uintptr_t) i0 + input_increment);  // final pass: m is now in [1, 7]; i0 always addresses a real row
+  i1 = (const float*) ((uintptr_t) i1 + input_increment);
+  if (m < 2) {  // fewer than 2 rows left: row 1 reads from `zero` instead
+    i1 = zero;
+  }
+  i2 = (const float*) ((uintptr_t) i2 + input_increment);
+  if (m <= 2) {  // the same substitution repeats for rows 2..6 against the remaining row count
+    i2 = zero;
+  }
+  i3 = (const float*) ((uintptr_t) i3 + input_increment);
+  if (m < 4) {
+    i3 = zero;
+  }
+  i4 = (const float*) ((uintptr_t) i4 + input_increment);
+  if (m <= 4) {
+    i4 = zero;
+  }
+  i5 = (const float*) ((uintptr_t) i5 + input_increment);
+  if (m < 6) {
+    i5 = zero;
+  }
+  i6 = (const float*) ((uintptr_t) i6 + input_increment);
+  if (m <= 6) {
+    i6 = zero;
+  }
+  const float vmultiplier = params->scalar.multiplier;  // scale factor applied to each column total
+  const float voutput_min = params->scalar.output_min;
+  const float voutput_max = params->scalar.output_max;
+
+  b = buffer;
+  do {
+    const float vi0 = *i0++;
+    const float vi1 = *i1++;
+    const float vi2 = *i2++;
+    const float vi3 = *i3++;
+    const float vi4 = *i4++;
+    const float vi5 = *i5++;
+    const float vi6 = *i6++;
+    const float vacc = *b++;  // buffer is only read in this pass; results go to output
+
+    const float vsum01 = vi0 + vi1;
+    const float vsum23 = vi2 + vi3;
+    const float vsum45 = vi4 + vi5;
+    const float vsum6a = vi6 + vacc;
+
+    const float vsum0123 = vsum01 + vsum23;
+    const float vsum456a = vsum45 + vsum6a;
+
+    const float vsum = vsum0123 + vsum456a;
+
+    float vout = vsum * vmultiplier;  // scale the column total
+    vout = __builtin_wasm_max_f32(vout, voutput_min);  // lower clamp, then upper clamp
+    vout = __builtin_wasm_min_f32(vout, voutput_max);
+
+    *output++ = vout;
+  } while (--n != 0);  // n itself is the counter here; it is not needed afterwards
+}

diff --git a/src/f32-gavgpool/up7-wasm.c b/src/f32-gavgpool/up7-wasm.c
new file mode 100644
index 0000000..89afcdf
--- /dev/null
+++ b/src/f32-gavgpool/up7-wasm.c

@@ -0,0 +1,78 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/gavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_gavgpool_ukernel_up7__wasm(  // single-pass column sum over at most 7 rows of n floats, scaled by multiplier and clamped
+    size_t m,  // number of valid input rows (asserted 1..7)
+    size_t n,  // floats per row (asserted non-zero); consumed as the loop counter
+    const float* input,  // first row
+    size_t input_stride,  // bytes between consecutive rows
+    const float* zero,  // substituted for rows at index >= m (presumably >= n zero floats -- confirm with caller)
+    float* output,  // n results are written here
+    const union xnn_f32_avgpool_params params[restrict static 1])  // supplies scalar.multiplier, scalar.output_min, scalar.output_max
+{
+  assert(m != 0);
+  assert(m <= 7);
+  assert(n != 0);
+
+  const float* i0 = input;  // row 0 is always valid because m >= 1
+  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
+  if (m < 2) {  // row 1 does not exist: read from `zero` instead
+    i1 = zero;
+  }
+  const float* i2 = (const float*) ((uintptr_t) i1 + input_stride);  // derived from i1 even if i1 was replaced; in that case the check below replaces i2 as well
+  if (m <= 2) {
+    i2 = zero;
+  }
+  const float* i3 = (const float*) ((uintptr_t) i2 + input_stride);
+  if (m < 4) {
+    i3 = zero;
+  }
+  const float* i4 = (const float*) ((uintptr_t) i3 + input_stride);
+  if (m <= 4) {
+    i4 = zero;
+  }
+  const float* i5 = (const float*) ((uintptr_t) i4 + input_stride);
+  if (m < 6) {
+    i5 = zero;
+  }
+  const float* i6 = (const float*) ((uintptr_t) i5 + input_stride);
+  if (m <= 6) {
+    i6 = zero;
+  }
+
+  const float vmultiplier = params->scalar.multiplier;  // scale factor applied to each column total
+  const float voutput_min = params->scalar.output_min;
+  const float voutput_max = params->scalar.output_max;
+  do {  // one output element per iteration
+    const float vi0 = *i0++;
+    const float vi1 = *i1++;
+    const float vi2 = *i2++;
+    const float vi3 = *i3++;
+    const float vi4 = *i4++;
+    const float vi5 = *i5++;
+    const float vi6 = *i6++;
+
+    const float vsum01 = vi0 + vi1;  // pairwise tree of additions over the 7 row values
+    const float vsum23 = vi2 + vi3;
+    const float vsum45 = vi4 + vi5;
+
+    const float vsum016 = vsum01 + vi6;
+    const float vsum2345 = vsum23 + vsum45;
+
+    const float vsum = vsum016 + vsum2345;
+
+    float vout = vsum * vmultiplier;  // scale the column total
+    vout = __builtin_wasm_max_f32(vout, voutput_min);  // lower clamp, then upper clamp
+    vout = __builtin_wasm_min_f32(vout, voutput_max);
+
+    *output++ = vout;
+  } while (--n != 0);
+}

diff --git a/src/f32-gemm/gen-inc/1x4-wasm.c b/src/f32-gemm/gen-inc/1x4-wasm.c
new file mode 100644
index 0000000..b615412
--- /dev/null
+++ b/src/f32-gemm/gen-inc/1x4-wasm.c

@@ -0,0 +1,103 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_gemminc_ukernel_1x4__wasm(  // 1-row x 4-column f32 GEMM tile that starts from values in `acc`, with WAsm min/max clamp
+    size_t mr,  // rows in this tile (asserted to be exactly 1 via != 0 and <= 1)
+    size_t nc,  // output columns still to produce (asserted non-zero)
+    size_t kc,  // reduction length in bytes (asserted non-zero multiple of sizeof(float))
+    const float* restrict a,  // input row
+    size_t a_stride,  // not referenced by this single-row variant
+    const float* restrict w,  // packed weights, 4 floats per reduction step; never rewound
+    float* restrict c,  // output row
+    size_t cm_stride,  // not referenced by this single-row variant
+    size_t cn_stride,  // bytes added to the output pointer after each full group of 4 columns
+    const float*restrict acc,  // initial accumulator values, 4 floats per column group
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min and scalar.max
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+
+  do {  // one iteration per group of up to 4 output columns
+    float vacc00 = acc[0];  // accumulators start from the caller-supplied acc values
+    float vacc01 = acc[1];
+    float vacc02 = acc[2];
+    float vacc03 = acc[3];
+    acc += 4;
+
+    size_t k = kc;  // remaining reduction length, in bytes
+    do {
+      const float va0 = *a0++;
+
+      const float vb0 = w[0];
+      const float vb1 = w[1];
+      const float vb2 = w[2];
+      const float vb3 = w[3];
+      w += 4;
+
+      vacc00 += va0 * vb0;
+      vacc01 += va0 * vb1;
+      vacc02 += va0 * vb2;
+      vacc03 += va0 * vb3;
+
+      k -= sizeof(float);  // one float of the input row consumed per step
+    } while (k != 0);
+
+    const float vmin = params->scalar.min;  // lower clamp first
+    vacc00 = __builtin_wasm_max_f32(vacc00, vmin);
+    vacc01 = __builtin_wasm_max_f32(vacc01, vmin);
+    vacc02 = __builtin_wasm_max_f32(vacc02, vmin);
+    vacc03 = __builtin_wasm_max_f32(vacc03, vmin);
+
+    const float vmax = params->scalar.max;  // then upper clamp
+    vacc00 = __builtin_wasm_min_f32(vacc00, vmax);
+    vacc01 = __builtin_wasm_min_f32(vacc01, vmax);
+    vacc02 = __builtin_wasm_min_f32(vacc02, vmax);
+    vacc03 = __builtin_wasm_min_f32(vacc03, vmax);
+
+    if XNN_LIKELY(nc >= 4) {  // full group: store all 4 columns
+      c0[0] = vacc00;
+      c0[1] = vacc01;
+      c0[2] = vacc02;
+      c0[3] = vacc03;
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);  // byte-granular step to the next column group
+
+      a0 = (const void*) ((uintptr_t) a0 - kc);  // rewind the kc bytes consumed by the inner loop so the row is reused
+
+      nc -= 4;
+    } else {  // tail: nc is 1..3, stored according to its low two bits
+      if (nc & 2) {
+        c0[0] = vacc00;
+        c0[1] = vacc01;
+        vacc00 = vacc02;  // move column 2 into slot 0 for the possible single store below
+        c0 += 2;
+      }
+      if (nc & 1) {
+        c0[0] = vacc00;
+      }
+
+      nc = 0;  // terminates the outer loop
+    }
+  } while (nc != 0);
+}

diff --git a/src/f32-gemm/gen-inc/2x4-wasm.c b/src/f32-gemm/gen-inc/2x4-wasm.c
new file mode 100644
index 0000000..c362a76
--- /dev/null
+++ b/src/f32-gemm/gen-inc/2x4-wasm.c

@@ -0,0 +1,137 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_gemminc_ukernel_2x4__wasm(  // 2-row x 4-column f32 GEMM tile that starts from values in `acc`, with WAsm min/max clamp
+    size_t mr,  // valid rows in this tile (asserted 1..2)
+    size_t nc,  // output columns still to produce (asserted non-zero)
+    size_t kc,  // reduction length in bytes (asserted non-zero multiple of sizeof(float))
+    const float* restrict a,  // first input row
+    size_t a_stride,  // bytes between input rows
+    const float* restrict w,  // packed weights, 4 floats per reduction step; never rewound
+    float* restrict c,  // first output row
+    size_t cm_stride,  // bytes between output rows
+    size_t cn_stride,  // bytes added to each output pointer after a full group of 4 columns
+    const float*restrict acc,  // initial accumulator values, 8 floats per column group (row 0 then row 1)
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min and scalar.max
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {  // only one valid row: row 1 aliases row 0 for both loads and stores
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {  // one iteration per group of up to 4 output columns
+    float vacc00 = acc[0];  // accumulators start from the caller-supplied acc values
+    float vacc01 = acc[1];
+    float vacc02 = acc[2];
+    float vacc03 = acc[3];
+    float vacc10 = acc[4];
+    float vacc11 = acc[5];
+    float vacc12 = acc[6];
+    float vacc13 = acc[7];
+    acc += 8;
+
+    size_t k = kc;  // remaining reduction length, in bytes
+    do {
+      const float va0 = *a0++;
+      const float va1 = *a1++;
+
+      const float vb0 = w[0];
+      const float vb1 = w[1];
+      const float vb2 = w[2];
+      const float vb3 = w[3];
+      w += 4;
+
+      vacc00 += va0 * vb0;
+      vacc01 += va0 * vb1;
+      vacc02 += va0 * vb2;
+      vacc03 += va0 * vb3;
+      vacc10 += va1 * vb0;
+      vacc11 += va1 * vb1;
+      vacc12 += va1 * vb2;
+      vacc13 += va1 * vb3;
+
+      k -= sizeof(float);  // one float per input row consumed per step
+    } while (k != 0);
+
+    const float vmin = params->scalar.min;  // lower clamp first
+    vacc00 = __builtin_wasm_max_f32(vacc00, vmin);
+    vacc01 = __builtin_wasm_max_f32(vacc01, vmin);
+    vacc02 = __builtin_wasm_max_f32(vacc02, vmin);
+    vacc03 = __builtin_wasm_max_f32(vacc03, vmin);
+    vacc10 = __builtin_wasm_max_f32(vacc10, vmin);
+    vacc11 = __builtin_wasm_max_f32(vacc11, vmin);
+    vacc12 = __builtin_wasm_max_f32(vacc12, vmin);
+    vacc13 = __builtin_wasm_max_f32(vacc13, vmin);
+
+    const float vmax = params->scalar.max;  // then upper clamp
+    vacc00 = __builtin_wasm_min_f32(vacc00, vmax);
+    vacc01 = __builtin_wasm_min_f32(vacc01, vmax);
+    vacc02 = __builtin_wasm_min_f32(vacc02, vmax);
+    vacc03 = __builtin_wasm_min_f32(vacc03, vmax);
+    vacc10 = __builtin_wasm_min_f32(vacc10, vmax);
+    vacc11 = __builtin_wasm_min_f32(vacc11, vmax);
+    vacc12 = __builtin_wasm_min_f32(vacc12, vmax);
+    vacc13 = __builtin_wasm_min_f32(vacc13, vmax);
+
+    if XNN_LIKELY(nc >= 4) {  // full group: store all 4 columns of each row
+      c1[0] = vacc10;  // row 1 is stored before row 0, so when c1 aliases c0 the row-0 values are written last
+      c1[1] = vacc11;
+      c1[2] = vacc12;
+      c1[3] = vacc13;
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      c0[0] = vacc00;
+      c0[1] = vacc01;
+      c0[2] = vacc02;
+      c0[3] = vacc03;
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a1 = (const void*) ((uintptr_t) a1 - kc);  // rewind the kc bytes consumed by the inner loop so the rows are reused
+      a0 = (const void*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {  // tail: nc is 1..3, stored according to its low two bits
+      if (nc & 2) {
+        c1[0] = vacc10;
+        c1[1] = vacc11;
+        vacc10 = vacc12;  // move column 2 into slot 0 for the possible single store below
+        c1 += 2;
+        c0[0] = vacc00;
+        c0[1] = vacc01;
+        vacc00 = vacc02;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        c1[0] = vacc10;
+        c0[0] = vacc00;
+      }
+
+      nc = 0;  // terminates the outer loop
+    }
+  } while (nc != 0);
+}

diff --git a/src/f32-gemm/gen-inc/4x4-wasm.c b/src/f32-gemm/gen-inc/4x4-wasm.c
new file mode 100644
index 0000000..7522a84
--- /dev/null
+++ b/src/f32-gemm/gen-inc/4x4-wasm.c

@@ -0,0 +1,205 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_gemminc_ukernel_4x4__wasm(  // 4-row x 4-column f32 GEMM tile that starts from values in `acc`, with WAsm min/max clamp
+    size_t mr,  // valid rows in this tile (asserted 1..4)
+    size_t nc,  // output columns still to produce (asserted non-zero)
+    size_t kc,  // reduction length in bytes (asserted non-zero multiple of sizeof(float))
+    const float* restrict a,  // first input row
+    size_t a_stride,  // bytes between input rows
+    const float* restrict w,  // packed weights, 4 floats per reduction step; never rewound
+    float* restrict c,  // first output row
+    size_t cm_stride,  // bytes between output rows
+    size_t cn_stride,  // bytes added to each output pointer after a full group of 4 columns
+    const float*restrict acc,  // initial accumulator values, 16 floats per column group (row-major, 4 per row)
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min and scalar.max
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  assert(acc != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {  // a row at index >= mr aliases the previous row for both loads and stores
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {  // one iteration per group of up to 4 output columns
+    float vacc00 = acc[0];  // accumulators start from the caller-supplied acc values; vaccRC is row R, column C
+    float vacc01 = acc[1];
+    float vacc02 = acc[2];
+    float vacc03 = acc[3];
+    float vacc10 = acc[4];
+    float vacc11 = acc[5];
+    float vacc12 = acc[6];
+    float vacc13 = acc[7];
+    float vacc20 = acc[8];
+    float vacc21 = acc[9];
+    float vacc22 = acc[10];
+    float vacc23 = acc[11];
+    float vacc30 = acc[12];
+    float vacc31 = acc[13];
+    float vacc32 = acc[14];
+    float vacc33 = acc[15];
+    acc += 16;
+
+    size_t k = kc;  // remaining reduction length, in bytes
+    do {
+      const float va0 = *a0++;
+      const float va1 = *a1++;
+      const float va2 = *a2++;
+      const float va3 = *a3++;
+
+      const float vb0 = w[0];
+      const float vb1 = w[1];
+      const float vb2 = w[2];
+      const float vb3 = w[3];
+      w += 4;
+
+      vacc00 += va0 * vb0;  // rank-1 update: each row input times each of the 4 column weights
+      vacc01 += va0 * vb1;
+      vacc02 += va0 * vb2;
+      vacc03 += va0 * vb3;
+      vacc10 += va1 * vb0;
+      vacc11 += va1 * vb1;
+      vacc12 += va1 * vb2;
+      vacc13 += va1 * vb3;
+      vacc20 += va2 * vb0;
+      vacc21 += va2 * vb1;
+      vacc22 += va2 * vb2;
+      vacc23 += va2 * vb3;
+      vacc30 += va3 * vb0;
+      vacc31 += va3 * vb1;
+      vacc32 += va3 * vb2;
+      vacc33 += va3 * vb3;
+
+      k -= sizeof(float);  // one float per input row consumed per step
+    } while (k != 0);
+
+    const float vmin = params->scalar.min;  // lower clamp first
+    vacc00 = __builtin_wasm_max_f32(vacc00, vmin);
+    vacc01 = __builtin_wasm_max_f32(vacc01, vmin);
+    vacc02 = __builtin_wasm_max_f32(vacc02, vmin);
+    vacc03 = __builtin_wasm_max_f32(vacc03, vmin);
+    vacc10 = __builtin_wasm_max_f32(vacc10, vmin);
+    vacc11 = __builtin_wasm_max_f32(vacc11, vmin);
+    vacc12 = __builtin_wasm_max_f32(vacc12, vmin);
+    vacc13 = __builtin_wasm_max_f32(vacc13, vmin);
+    vacc20 = __builtin_wasm_max_f32(vacc20, vmin);
+    vacc21 = __builtin_wasm_max_f32(vacc21, vmin);
+    vacc22 = __builtin_wasm_max_f32(vacc22, vmin);
+    vacc23 = __builtin_wasm_max_f32(vacc23, vmin);
+    vacc30 = __builtin_wasm_max_f32(vacc30, vmin);
+    vacc31 = __builtin_wasm_max_f32(vacc31, vmin);
+    vacc32 = __builtin_wasm_max_f32(vacc32, vmin);
+    vacc33 = __builtin_wasm_max_f32(vacc33, vmin);
+
+    const float vmax = params->scalar.max;  // then upper clamp
+    vacc00 = __builtin_wasm_min_f32(vacc00, vmax);
+    vacc01 = __builtin_wasm_min_f32(vacc01, vmax);
+    vacc02 = __builtin_wasm_min_f32(vacc02, vmax);
+    vacc03 = __builtin_wasm_min_f32(vacc03, vmax);
+    vacc10 = __builtin_wasm_min_f32(vacc10, vmax);
+    vacc11 = __builtin_wasm_min_f32(vacc11, vmax);
+    vacc12 = __builtin_wasm_min_f32(vacc12, vmax);
+    vacc13 = __builtin_wasm_min_f32(vacc13, vmax);
+    vacc20 = __builtin_wasm_min_f32(vacc20, vmax);
+    vacc21 = __builtin_wasm_min_f32(vacc21, vmax);
+    vacc22 = __builtin_wasm_min_f32(vacc22, vmax);
+    vacc23 = __builtin_wasm_min_f32(vacc23, vmax);
+    vacc30 = __builtin_wasm_min_f32(vacc30, vmax);
+    vacc31 = __builtin_wasm_min_f32(vacc31, vmax);
+    vacc32 = __builtin_wasm_min_f32(vacc32, vmax);
+    vacc33 = __builtin_wasm_min_f32(vacc33, vmax);
+
+    if XNN_LIKELY(nc >= 4) {  // full group: store all 4 columns of each row
+      c3[0] = vacc30;  // rows are stored from 3 down to 0, so where pointers alias the lowest row's values are written last
+      c3[1] = vacc31;
+      c3[2] = vacc32;
+      c3[3] = vacc33;
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      c2[0] = vacc20;
+      c2[1] = vacc21;
+      c2[2] = vacc22;
+      c2[3] = vacc23;
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      c1[0] = vacc10;
+      c1[1] = vacc11;
+      c1[2] = vacc12;
+      c1[3] = vacc13;
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      c0[0] = vacc00;
+      c0[1] = vacc01;
+      c0[2] = vacc02;
+      c0[3] = vacc03;
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a3 = (const void*) ((uintptr_t) a3 - kc);  // rewind the kc bytes consumed by the inner loop so the rows are reused
+      a2 = (const void*) ((uintptr_t) a2 - kc);
+      a1 = (const void*) ((uintptr_t) a1 - kc);
+      a0 = (const void*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {  // tail: nc is 1..3, stored according to its low two bits
+      if (nc & 2) {
+        c3[0] = vacc30;
+        c3[1] = vacc31;
+        vacc30 = vacc32;  // move column 2 into slot 0 for the possible single store below
+        c3 += 2;
+        c2[0] = vacc20;
+        c2[1] = vacc21;
+        vacc20 = vacc22;
+        c2 += 2;
+        c1[0] = vacc10;
+        c1[1] = vacc11;
+        vacc10 = vacc12;
+        c1 += 2;
+        c0[0] = vacc00;
+        c0[1] = vacc01;
+        vacc00 = vacc02;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        c3[0] = vacc30;
+        c2[0] = vacc20;
+        c1[0] = vacc10;
+        c0[0] = vacc00;
+      }
+
+      nc = 0;  // terminates the outer loop
+    }
+  } while (nc != 0);
+}

diff --git a/src/f32-gemm/gen/1x4-wasm.c b/src/f32-gemm/gen/1x4-wasm.c
new file mode 100644
index 0000000..706c06b
--- /dev/null
+++ b/src/f32-gemm/gen/1x4-wasm.c

@@ -0,0 +1,101 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_gemm_ukernel_1x4__wasm(  // 1-row x 4-column f32 GEMM tile whose accumulators start from the leading floats of `w`, with WAsm min/max clamp
+    size_t mr,  // rows in this tile (asserted to be exactly 1 via != 0 and <= 1)
+    size_t nc,  // output columns still to produce (asserted non-zero)
+    size_t kc,  // reduction length in bytes (asserted non-zero multiple of sizeof(float))
+    const float* restrict a,  // input row
+    size_t a_stride,  // not referenced by this single-row variant
+    const float* restrict w,  // packed per column group: 4 initial values, then 4 floats per reduction step; never rewound
+    float* restrict c,  // output row
+    size_t cm_stride,  // not referenced by this single-row variant
+    size_t cn_stride,  // bytes added to the output pointer after each full group of 4 columns
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min and scalar.max
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+
+  do {  // one iteration per group of up to 4 output columns
+    float vacc00 = w[0];  // accumulators start from the first 4 floats of the group (presumably bias -- confirm against the weight packing code)
+    float vacc01 = w[1];
+    float vacc02 = w[2];
+    float vacc03 = w[3];
+    w += 4;
+
+    size_t k = kc;  // remaining reduction length, in bytes
+    do {
+      const float va0 = *a0++;
+
+      const float vb0 = w[0];
+      const float vb1 = w[1];
+      const float vb2 = w[2];
+      const float vb3 = w[3];
+      w += 4;
+
+      vacc00 += va0 * vb0;
+      vacc01 += va0 * vb1;
+      vacc02 += va0 * vb2;
+      vacc03 += va0 * vb3;
+
+      k -= sizeof(float);  // one float of the input row consumed per step
+    } while (k != 0);
+
+    const float vmin = params->scalar.min;  // lower clamp first
+    vacc00 = __builtin_wasm_max_f32(vacc00, vmin);
+    vacc01 = __builtin_wasm_max_f32(vacc01, vmin);
+    vacc02 = __builtin_wasm_max_f32(vacc02, vmin);
+    vacc03 = __builtin_wasm_max_f32(vacc03, vmin);
+
+    const float vmax = params->scalar.max;  // then upper clamp
+    vacc00 = __builtin_wasm_min_f32(vacc00, vmax);
+    vacc01 = __builtin_wasm_min_f32(vacc01, vmax);
+    vacc02 = __builtin_wasm_min_f32(vacc02, vmax);
+    vacc03 = __builtin_wasm_min_f32(vacc03, vmax);
+
+    if XNN_LIKELY(nc >= 4) {  // full group: store all 4 columns
+      c0[0] = vacc00;
+      c0[1] = vacc01;
+      c0[2] = vacc02;
+      c0[3] = vacc03;
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);  // byte-granular step to the next column group
+
+      a0 = (const void*) ((uintptr_t) a0 - kc);  // rewind the kc bytes consumed by the inner loop so the row is reused
+
+      nc -= 4;
+    } else {  // tail: nc is 1..3, stored according to its low two bits
+      if (nc & 2) {
+        c0[0] = vacc00;
+        c0[1] = vacc01;
+        vacc00 = vacc02;  // move column 2 into slot 0 for the possible single store below
+        c0 += 2;
+      }
+      if (nc & 1) {
+        c0[0] = vacc00;
+      }
+
+      nc = 0;  // terminates the outer loop
+    }
+  } while (nc != 0);
+}

diff --git a/src/f32-gemm/gen/2x4-wasm.c b/src/f32-gemm/gen/2x4-wasm.c
new file mode 100644
index 0000000..989b9f0
--- /dev/null
+++ b/src/f32-gemm/gen/2x4-wasm.c

@@ -0,0 +1,135 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_gemm_ukernel_2x4__wasm(  // 2-row x 4-column f32 GEMM tile whose accumulators start from the leading floats of `w`, with WAsm min/max clamp
+    size_t mr,  // valid rows in this tile (asserted 1..2)
+    size_t nc,  // output columns still to produce (asserted non-zero)
+    size_t kc,  // reduction length in bytes (asserted non-zero multiple of sizeof(float))
+    const float* restrict a,  // first input row
+    size_t a_stride,  // bytes between input rows
+    const float* restrict w,  // packed per column group: 4 initial values, then 4 floats per reduction step; never rewound
+    float* restrict c,  // first output row
+    size_t cm_stride,  // bytes between output rows
+    size_t cn_stride,  // bytes added to each output pointer after a full group of 4 columns
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min and scalar.max
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {  // only one valid row: row 1 aliases row 0 for both loads and stores
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {  // one iteration per group of up to 4 output columns
+    float vacc00 = w[0];  // row-0 accumulators start from the first 4 floats of the group (presumably bias -- confirm against the weight packing code)
+    float vacc01 = w[1];
+    float vacc02 = w[2];
+    float vacc03 = w[3];
+    w += 4;
+    float vacc10 = vacc00;  // row 1 starts from the same per-column initial values as row 0
+    float vacc11 = vacc01;
+    float vacc12 = vacc02;
+    float vacc13 = vacc03;
+
+    size_t k = kc;  // remaining reduction length, in bytes
+    do {
+      const float va0 = *a0++;
+      const float va1 = *a1++;
+
+      const float vb0 = w[0];
+      const float vb1 = w[1];
+      const float vb2 = w[2];
+      const float vb3 = w[3];
+      w += 4;
+
+      vacc00 += va0 * vb0;
+      vacc01 += va0 * vb1;
+      vacc02 += va0 * vb2;
+      vacc03 += va0 * vb3;
+      vacc10 += va1 * vb0;
+      vacc11 += va1 * vb1;
+      vacc12 += va1 * vb2;
+      vacc13 += va1 * vb3;
+
+      k -= sizeof(float);  // one float per input row consumed per step
+    } while (k != 0);
+
+    const float vmin = params->scalar.min;  // lower clamp first
+    vacc00 = __builtin_wasm_max_f32(vacc00, vmin);
+    vacc01 = __builtin_wasm_max_f32(vacc01, vmin);
+    vacc02 = __builtin_wasm_max_f32(vacc02, vmin);
+    vacc03 = __builtin_wasm_max_f32(vacc03, vmin);
+    vacc10 = __builtin_wasm_max_f32(vacc10, vmin);
+    vacc11 = __builtin_wasm_max_f32(vacc11, vmin);
+    vacc12 = __builtin_wasm_max_f32(vacc12, vmin);
+    vacc13 = __builtin_wasm_max_f32(vacc13, vmin);
+
+    const float vmax = params->scalar.max;  // then upper clamp
+    vacc00 = __builtin_wasm_min_f32(vacc00, vmax);
+    vacc01 = __builtin_wasm_min_f32(vacc01, vmax);
+    vacc02 = __builtin_wasm_min_f32(vacc02, vmax);
+    vacc03 = __builtin_wasm_min_f32(vacc03, vmax);
+    vacc10 = __builtin_wasm_min_f32(vacc10, vmax);
+    vacc11 = __builtin_wasm_min_f32(vacc11, vmax);
+    vacc12 = __builtin_wasm_min_f32(vacc12, vmax);
+    vacc13 = __builtin_wasm_min_f32(vacc13, vmax);
+
+    if XNN_LIKELY(nc >= 4) {  // full group: store all 4 columns of each row
+      c1[0] = vacc10;  // row 1 is stored before row 0, so when c1 aliases c0 the row-0 values are written last
+      c1[1] = vacc11;
+      c1[2] = vacc12;
+      c1[3] = vacc13;
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      c0[0] = vacc00;
+      c0[1] = vacc01;
+      c0[2] = vacc02;
+      c0[3] = vacc03;
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a1 = (const void*) ((uintptr_t) a1 - kc);  // rewind the kc bytes consumed by the inner loop so the rows are reused
+      a0 = (const void*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {  // tail: nc is 1..3, stored according to its low two bits
+      if (nc & 2) {
+        c1[0] = vacc10;
+        c1[1] = vacc11;
+        vacc10 = vacc12;  // move column 2 into slot 0 for the possible single store below
+        c1 += 2;
+        c0[0] = vacc00;
+        c0[1] = vacc01;
+        vacc00 = vacc02;
+        c0 += 2;
+      }
+      if (nc & 1) {
+        c1[0] = vacc10;
+        c0[0] = vacc00;
+      }
+
+      nc = 0;  // terminates the outer loop
+    }
+  } while (nc != 0);
+}

diff --git a/src/f32-gemm/gen/4x2-wasm.c b/src/f32-gemm/gen/4x2-wasm.c
new file mode 100644
index 0000000..9ea76ce
--- /dev/null
+++ b/src/f32-gemm/gen/4x2-wasm.c

@@ -0,0 +1,143 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_gemm_ukernel_4x2__wasm(  // f32 GEMM micro-kernel: tiles of up to 4 rows x 2 columns, clamped via the WAsm min/max builtins
+    size_t mr,  // number of rows to compute; asserted to be in 1..4
+    size_t nc,  // output columns still to produce; 2 are consumed per outer iteration
+    size_t kc,  // reduction length in bytes; asserted to be a non-zero multiple of sizeof(float)
+    const float* restrict a,  // row 0 of the input; rows 1..3 are located via a_stride
+    size_t a_stride,  // byte distance between consecutive input rows
+    const float* restrict w,  // per column tile: 2 initial accumulator values (presumably bias), then 2 weights per reduction step
+    float* restrict c,  // row 0 of the output; rows 1..3 are located via cm_stride
+    size_t cm_stride,  // byte distance between consecutive output rows
+    size_t cn_stride,  // bytes added to every output row pointer after a full 2-column tile
+    const union xnn_f32_output_params params[restrict static 1])  // provides the scalar.min / scalar.max clamp bounds
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {  // row 1 not requested: alias it to row 0
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {  // row 2 not requested: alias it to row 1
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {  // row 3 not requested: alias it to row 2
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {  // one iteration per tile of up to 2 output columns
+    float vacc00 = w[0];
+    float vacc01 = w[1];
+    w += 2;
+    float vacc10 = vacc00;  // every row starts from the same two initial values
+    float vacc11 = vacc01;
+    float vacc20 = vacc00;
+    float vacc21 = vacc01;
+    float vacc30 = vacc00;
+    float vacc31 = vacc01;
+
+    size_t k = kc;  // bytes of the reduction still to process
+    do {
+      const float va0 = *a0++;
+      const float va1 = *a1++;
+      const float va2 = *a2++;
+      const float va3 = *a3++;
+
+      const float vb0 = w[0];
+      const float vb1 = w[1];
+      w += 2;
+
+      vacc00 += va0 * vb0;
+      vacc01 += va0 * vb1;
+      vacc10 += va1 * vb0;
+      vacc11 += va1 * vb1;
+      vacc20 += va2 * vb0;
+      vacc21 += va2 * vb1;
+      vacc30 += va3 * vb0;
+      vacc31 += va3 * vb1;
+
+      k -= sizeof(float);  // one float per row was consumed
+    } while (k != 0);
+
+    const float vmin = params->scalar.min;  // lower clamp bound
+    vacc00 = __builtin_wasm_max_f32(vacc00, vmin);
+    vacc01 = __builtin_wasm_max_f32(vacc01, vmin);
+    vacc10 = __builtin_wasm_max_f32(vacc10, vmin);
+    vacc11 = __builtin_wasm_max_f32(vacc11, vmin);
+    vacc20 = __builtin_wasm_max_f32(vacc20, vmin);
+    vacc21 = __builtin_wasm_max_f32(vacc21, vmin);
+    vacc30 = __builtin_wasm_max_f32(vacc30, vmin);
+    vacc31 = __builtin_wasm_max_f32(vacc31, vmin);
+
+    const float vmax = params->scalar.max;  // upper clamp bound
+    vacc00 = __builtin_wasm_min_f32(vacc00, vmax);
+    vacc01 = __builtin_wasm_min_f32(vacc01, vmax);
+    vacc10 = __builtin_wasm_min_f32(vacc10, vmax);
+    vacc11 = __builtin_wasm_min_f32(vacc11, vmax);
+    vacc20 = __builtin_wasm_min_f32(vacc20, vmax);
+    vacc21 = __builtin_wasm_min_f32(vacc21, vmax);
+    vacc30 = __builtin_wasm_min_f32(vacc30, vmax);
+    vacc31 = __builtin_wasm_min_f32(vacc31, vmax);
+
+    if XNN_LIKELY(nc >= 2) {  // full tile: store both columns of every row
+      c3[0] = vacc30;  // rows are stored from 3 down to 0, so with aliased row pointers the lower row is written last
+      c3[1] = vacc31;
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      c2[0] = vacc20;
+      c2[1] = vacc21;
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      c1[0] = vacc10;
+      c1[1] = vacc11;
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      c0[0] = vacc00;
+      c0[1] = vacc01;
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a3 = (const void*) ((uintptr_t) a3 - kc);  // rewind the input rows by kc bytes for the next column tile
+      a2 = (const void*) ((uintptr_t) a2 - kc);
+      a1 = (const void*) ((uintptr_t) a1 - kc);
+      a0 = (const void*) ((uintptr_t) a0 - kc);
+
+      nc -= 2;
+    } else {  // remainder: fewer than 2 columns are left
+      if (nc & 1) {
+        c3[0] = vacc30;
+        c2[0] = vacc20;
+        c1[0] = vacc10;
+        c0[0] = vacc00;
+      }
+
+      nc = 0;  // the remainder tile is always the last one
+    }
+  } while (nc != 0);
+}

diff --git a/src/f32-gemm/gen/4x4-wasm.c b/src/f32-gemm/gen/4x4-wasm.c
new file mode 100644
index 0000000..3b6e37f
--- /dev/null
+++ b/src/f32-gemm/gen/4x4-wasm.c

@@ -0,0 +1,203 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-gemm/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/gemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_gemm_ukernel_4x4__wasm(  // f32 GEMM micro-kernel: tiles of up to 4 rows x 4 columns, clamped via the WAsm min/max builtins
+    size_t mr,  // number of rows to compute; asserted to be in 1..4
+    size_t nc,  // output columns still to produce; 4 are consumed per outer iteration
+    size_t kc,  // reduction length in bytes; asserted to be a non-zero multiple of sizeof(float)
+    const float* restrict a,  // row 0 of the input; rows 1..3 are located via a_stride
+    size_t a_stride,  // byte distance between consecutive input rows
+    const float* restrict w,  // per column tile: 4 initial accumulator values (presumably bias), then 4 weights per reduction step
+    float* restrict c,  // row 0 of the output; rows 1..3 are located via cm_stride
+    size_t cm_stride,  // byte distance between consecutive output rows
+    size_t cn_stride,  // bytes added to every output row pointer after a full 4-column tile
+    const union xnn_f32_output_params params[restrict static 1])  // provides the scalar.min / scalar.max clamp bounds
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const float* a0 = a;
+  float* c0 = c;
+  const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {  // row 1 not requested: alias it to row 0
+    a1 = a0;
+    c1 = c0;
+  }
+  const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {  // row 2 not requested: alias it to row 1
+    a2 = a1;
+    c2 = c1;
+  }
+  const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {  // row 3 not requested: alias it to row 2
+    a3 = a2;
+    c3 = c2;
+  }
+
+  do {  // one iteration per tile of up to 4 output columns
+    float vacc00 = w[0];
+    float vacc01 = w[1];
+    float vacc02 = w[2];
+    float vacc03 = w[3];
+    w += 4;
+    float vacc10 = vacc00;  // every row starts from the same four initial values
+    float vacc11 = vacc01;
+    float vacc12 = vacc02;
+    float vacc13 = vacc03;
+    float vacc20 = vacc00;
+    float vacc21 = vacc01;
+    float vacc22 = vacc02;
+    float vacc23 = vacc03;
+    float vacc30 = vacc00;
+    float vacc31 = vacc01;
+    float vacc32 = vacc02;
+    float vacc33 = vacc03;
+
+    size_t k = kc;  // bytes of the reduction still to process
+    do {
+      const float va0 = *a0++;
+      const float va1 = *a1++;
+      const float va2 = *a2++;
+      const float va3 = *a3++;
+
+      const float vb0 = w[0];
+      const float vb1 = w[1];
+      const float vb2 = w[2];
+      const float vb3 = w[3];
+      w += 4;
+
+      vacc00 += va0 * vb0;
+      vacc01 += va0 * vb1;
+      vacc02 += va0 * vb2;
+      vacc03 += va0 * vb3;
+      vacc10 += va1 * vb0;
+      vacc11 += va1 * vb1;
+      vacc12 += va1 * vb2;
+      vacc13 += va1 * vb3;
+      vacc20 += va2 * vb0;
+      vacc21 += va2 * vb1;
+      vacc22 += va2 * vb2;
+      vacc23 += va2 * vb3;
+      vacc30 += va3 * vb0;
+      vacc31 += va3 * vb1;
+      vacc32 += va3 * vb2;
+      vacc33 += va3 * vb3;
+
+      k -= sizeof(float);  // one float per row was consumed
+    } while (k != 0);
+
+    const float vmin = params->scalar.min;  // lower clamp bound
+    vacc00 = __builtin_wasm_max_f32(vacc00, vmin);
+    vacc01 = __builtin_wasm_max_f32(vacc01, vmin);
+    vacc02 = __builtin_wasm_max_f32(vacc02, vmin);
+    vacc03 = __builtin_wasm_max_f32(vacc03, vmin);
+    vacc10 = __builtin_wasm_max_f32(vacc10, vmin);
+    vacc11 = __builtin_wasm_max_f32(vacc11, vmin);
+    vacc12 = __builtin_wasm_max_f32(vacc12, vmin);
+    vacc13 = __builtin_wasm_max_f32(vacc13, vmin);
+    vacc20 = __builtin_wasm_max_f32(vacc20, vmin);
+    vacc21 = __builtin_wasm_max_f32(vacc21, vmin);
+    vacc22 = __builtin_wasm_max_f32(vacc22, vmin);
+    vacc23 = __builtin_wasm_max_f32(vacc23, vmin);
+    vacc30 = __builtin_wasm_max_f32(vacc30, vmin);
+    vacc31 = __builtin_wasm_max_f32(vacc31, vmin);
+    vacc32 = __builtin_wasm_max_f32(vacc32, vmin);
+    vacc33 = __builtin_wasm_max_f32(vacc33, vmin);
+
+    const float vmax = params->scalar.max;  // upper clamp bound
+    vacc00 = __builtin_wasm_min_f32(vacc00, vmax);
+    vacc01 = __builtin_wasm_min_f32(vacc01, vmax);
+    vacc02 = __builtin_wasm_min_f32(vacc02, vmax);
+    vacc03 = __builtin_wasm_min_f32(vacc03, vmax);
+    vacc10 = __builtin_wasm_min_f32(vacc10, vmax);
+    vacc11 = __builtin_wasm_min_f32(vacc11, vmax);
+    vacc12 = __builtin_wasm_min_f32(vacc12, vmax);
+    vacc13 = __builtin_wasm_min_f32(vacc13, vmax);
+    vacc20 = __builtin_wasm_min_f32(vacc20, vmax);
+    vacc21 = __builtin_wasm_min_f32(vacc21, vmax);
+    vacc22 = __builtin_wasm_min_f32(vacc22, vmax);
+    vacc23 = __builtin_wasm_min_f32(vacc23, vmax);
+    vacc30 = __builtin_wasm_min_f32(vacc30, vmax);
+    vacc31 = __builtin_wasm_min_f32(vacc31, vmax);
+    vacc32 = __builtin_wasm_min_f32(vacc32, vmax);
+    vacc33 = __builtin_wasm_min_f32(vacc33, vmax);
+
+    if XNN_LIKELY(nc >= 4) {  // full tile: store all four columns of every row
+      c3[0] = vacc30;  // rows are stored from 3 down to 0, so with aliased row pointers the lower row is written last
+      c3[1] = vacc31;
+      c3[2] = vacc32;
+      c3[3] = vacc33;
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      c2[0] = vacc20;
+      c2[1] = vacc21;
+      c2[2] = vacc22;
+      c2[3] = vacc23;
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      c1[0] = vacc10;
+      c1[1] = vacc11;
+      c1[2] = vacc12;
+      c1[3] = vacc13;
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      c0[0] = vacc00;
+      c0[1] = vacc01;
+      c0[2] = vacc02;
+      c0[3] = vacc03;
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a3 = (const void*) ((uintptr_t) a3 - kc);  // rewind the input rows by kc bytes for the next column tile
+      a2 = (const void*) ((uintptr_t) a2 - kc);
+      a1 = (const void*) ((uintptr_t) a1 - kc);
+      a0 = (const void*) ((uintptr_t) a0 - kc);
+
+      nc -= 4;
+    } else {  // remainder: 1 to 3 columns are left
+      if (nc & 2) {  // store columns 0-1, then move column 2 into slot 0 for the single-column store below
+        c3[0] = vacc30;
+        c3[1] = vacc31;
+        vacc30 = vacc32;
+        c3 += 2;
+        c2[0] = vacc20;
+        c2[1] = vacc21;
+        vacc20 = vacc22;
+        c2 += 2;
+        c1[0] = vacc10;
+        c1[1] = vacc11;
+        vacc10 = vacc12;
+        c1 += 2;
+        c0[0] = vacc00;
+        c0[1] = vacc01;
+        vacc00 = vacc02;
+        c0 += 2;
+      }
+      if (nc & 1) {  // store the one remaining column
+        c3[0] = vacc30;
+        c2[0] = vacc20;
+        c1[0] = vacc10;
+        c0[0] = vacc00;
+      }
+
+      nc = 0;  // the remainder tile is always the last one
+    }
+  } while (nc != 0);
+}

diff --git a/src/f32-gemm/scalar.c.in b/src/f32-gemm/scalar.c.in
index 622f106..90d703f 100644
--- a/src/f32-gemm/scalar.c.in
+++ b/src/f32-gemm/scalar.c.in

@@ -9,7 +9,9 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__scalar(
+$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32"
+$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32"
+void xnn_f32_gemm${"inc" if INC else ""}_ukernel_${MR}x${NR}__${"wasm" if WASM else "scalar"}(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -88,12 +90,12 @@
     const float vmin = params->scalar.min;
     $for M in range(MR):
       $for N in range(NR):
-        vacc${M}${N} = math_max_f32(vacc${M}${N}, vmin);
+        vacc${M}${N} = ${MAX_F32}(vacc${M}${N}, vmin);
 
     const float vmax = params->scalar.max;
     $for M in range(MR):
       $for N in range(NR):
-        vacc${M}${N} = math_min_f32(vacc${M}${N}, vmax);
+        vacc${M}${N} = ${MIN_F32}(vacc${M}${N}, vmax);
 
     if XNN_LIKELY(nc >= ${NR}) {
       $for M in reversed(range(MR)):

diff --git a/src/f32-hswish/wasm.c b/src/f32-hswish/wasm.c
new file mode 100644
index 0000000..3a0d2bb
--- /dev/null
+++ b/src/f32-hswish/wasm.c

@@ -0,0 +1,36 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/hswish.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_hswish_ukernel__wasm(  // element-wise y = x * min(max(x * sixth + half, 0), one), using the WAsm min/max builtins
+    size_t n,  // size of the input in bytes; asserted to be a non-zero multiple of sizeof(float)
+    const float* x,  // input elements
+    float* y,  // output elements, one written per input element
+    const union xnn_f32_hswish_params params[restrict static 1])  // provides scalar.sixth, scalar.half and scalar.one
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vsixth = params->scalar.sixth;
+  const float vhalf = params->scalar.half;
+  const float vone = params->scalar.one;
+  assert(vhalf == 0.5f);
+  assert(vone == 1.0f);
+
+  do {
+    const float vx = *x++;
+
+    const float vt = __builtin_wasm_min_f32(__builtin_wasm_max_f32(vx * vsixth + vhalf, 0.0f), vone);  // clamp x * sixth + half to [0, one]
+    const float vy = vt * vx;
+
+    *y++ = vy;
+    n -= sizeof(float);  // n counts bytes; step by the element size checked in the assert above
+  } while (n != 0);
+}

diff --git a/src/f32-igemm/gen/1x4-wasm.c b/src/f32-igemm/gen/1x4-wasm.c
new file mode 100644
index 0000000..79d8c42
--- /dev/null
+++ b/src/f32-igemm/gen/1x4-wasm.c

@@ -0,0 +1,115 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_igemm_ukernel_1x4__wasm(  // f32 indirect-GEMM micro-kernel: tiles of 1 row x up to 4 columns, clamped via the WAsm min/max builtins
+    size_t mr,  // number of rows to compute; asserted to be exactly 1
+    size_t nc,  // output columns still to produce; 4 are consumed per outer iteration
+    size_t kc,  // reduction length per input pointer, in bytes; asserted to be a non-zero multiple of sizeof(float)
+    size_t ks,  // bytes of indirection pointers consumed per column tile; asserted to be a non-zero multiple of 1 * sizeof(void*)
+    const float**restrict a,  // indirection buffer of input row pointers, read 1 per step of the p loop
+    const float*restrict w,  // per column tile: 4 initial accumulator values (presumably bias), then 4 weights per reduction step
+    float*restrict c,  // output row
+    size_t cm_stride,  // not used by this 1-row variant
+    size_t cn_stride,  // bytes added to the output row pointer after a full 4-column tile
+    size_t a_offset,  // byte offset added to every input pointer that differs from zero
+    const float* zero,  // sentinel pointer; inputs equal to it are read without a_offset
+    const union xnn_f32_output_params params[restrict static 1])  // provides the scalar.min / scalar.max clamp bounds
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+
+  do {  // one iteration per tile of up to 4 output columns
+    float vacc00 = w[0];
+    float vacc01 = w[1];
+    float vacc02 = w[2];
+    float vacc03 = w[3];
+    w += 4;
+
+    size_t p = ks;  // bytes of indirection pointers still to consume for this tile
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {  // the zero sentinel is used as-is, without the offset
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = kc;  // bytes of this input row still to process
+      do {
+        const float va0 = *a0++;
+
+        const float vb0 = w[0];
+        const float vb1 = w[1];
+        const float vb2 = w[2];
+        const float vb3 = w[3];
+        w += 4;
+
+        vacc00 += va0 * vb0;
+        vacc01 += va0 * vb1;
+        vacc02 += va0 * vb2;
+        vacc03 += va0 * vb3;
+
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 1 * sizeof(void*);  // one indirection pointer was consumed
+    } while (p != 0);
+
+    const float vmin = params->scalar.min;  // lower clamp bound
+    vacc00 = __builtin_wasm_max_f32(vacc00, vmin);
+    vacc01 = __builtin_wasm_max_f32(vacc01, vmin);
+    vacc02 = __builtin_wasm_max_f32(vacc02, vmin);
+    vacc03 = __builtin_wasm_max_f32(vacc03, vmin);
+
+    const float vmax = params->scalar.max;  // upper clamp bound
+    vacc00 = __builtin_wasm_min_f32(vacc00, vmax);
+    vacc01 = __builtin_wasm_min_f32(vacc01, vmax);
+    vacc02 = __builtin_wasm_min_f32(vacc02, vmax);
+    vacc03 = __builtin_wasm_min_f32(vacc03, vmax);
+
+    if XNN_LIKELY(nc >= 4) {  // full tile: store all four columns
+      c0[0] = vacc00;
+      c0[1] = vacc01;
+      c0[2] = vacc02;
+      c0[3] = vacc03;
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);  // rewind the indirection buffer by ks bytes for the next column tile
+      nc -= 4;
+    } else {  // remainder: 1 to 3 columns are left
+      if (nc & 2) {  // store columns 0-1, then move column 2 into slot 0 for the single-column store below
+        c0[0] = vacc00;
+        c0[1] = vacc01;
+        vacc00 = vacc02;
+        c0 += 2;
+      }
+      if (nc & 1) {  // store the one remaining column
+        c0[0] = vacc00;
+      }
+
+      nc = 0;  // the remainder tile is always the last one
+    }
+  } while (nc != 0);
+}

diff --git a/src/f32-igemm/gen/2x4-wasm.c b/src/f32-igemm/gen/2x4-wasm.c
new file mode 100644
index 0000000..5347636
--- /dev/null
+++ b/src/f32-igemm/gen/2x4-wasm.c

@@ -0,0 +1,151 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_igemm_ukernel_2x4__wasm(  // f32 indirect-GEMM micro-kernel: tiles of up to 2 rows x 4 columns, clamped via the WAsm min/max builtins
+    size_t mr,  // number of rows to compute; asserted to be in 1..2
+    size_t nc,  // output columns still to produce; 4 are consumed per outer iteration
+    size_t kc,  // reduction length per input pointer, in bytes; asserted to be a non-zero multiple of sizeof(float)
+    size_t ks,  // bytes of indirection pointers consumed per column tile; asserted to be a non-zero multiple of 2 * sizeof(void*)
+    const float**restrict a,  // indirection buffer of input row pointers, read 2 per step of the p loop
+    const float*restrict w,  // per column tile: 4 initial accumulator values (presumably bias), then 4 weights per reduction step
+    float*restrict c,  // row 0 of the output; row 1 is located via cm_stride
+    size_t cm_stride,  // byte distance between consecutive output rows
+    size_t cn_stride,  // bytes added to every output row pointer after a full 4-column tile
+    size_t a_offset,  // byte offset added to every input pointer that differs from zero
+    const float* zero,  // sentinel pointer; inputs equal to it are read without a_offset
+    const union xnn_f32_output_params params[restrict static 1])  // provides the scalar.min / scalar.max clamp bounds
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {  // row 1 not requested: alias its output pointer to row 0
+    c1 = c0;
+  }
+
+  do {  // one iteration per tile of up to 4 output columns
+    float vacc00 = w[0];
+    float vacc01 = w[1];
+    float vacc02 = w[2];
+    float vacc03 = w[3];
+    float vacc10 = vacc00;  // every row starts from the same four initial values
+    float vacc11 = vacc01;
+    float vacc12 = vacc02;
+    float vacc13 = vacc03;
+    w += 4;
+
+    size_t p = ks;  // bytes of indirection pointers still to consume for this tile
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {  // the zero sentinel is used as-is, without the offset
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;  // bytes of each input row still to process
+      do {
+        const float va0 = *a0++;
+        const float va1 = *a1++;
+
+        const float vb0 = w[0];
+        const float vb1 = w[1];
+        const float vb2 = w[2];
+        const float vb3 = w[3];
+        w += 4;
+
+        vacc00 += va0 * vb0;
+        vacc01 += va0 * vb1;
+        vacc02 += va0 * vb2;
+        vacc03 += va0 * vb3;
+        vacc10 += va1 * vb0;
+        vacc11 += va1 * vb1;
+        vacc12 += va1 * vb2;
+        vacc13 += va1 * vb3;
+
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 2 * sizeof(void*);  // two indirection pointers were consumed
+    } while (p != 0);
+
+    const float vmin = params->scalar.min;  // lower clamp bound
+    vacc00 = __builtin_wasm_max_f32(vacc00, vmin);
+    vacc01 = __builtin_wasm_max_f32(vacc01, vmin);
+    vacc02 = __builtin_wasm_max_f32(vacc02, vmin);
+    vacc03 = __builtin_wasm_max_f32(vacc03, vmin);
+    vacc10 = __builtin_wasm_max_f32(vacc10, vmin);
+    vacc11 = __builtin_wasm_max_f32(vacc11, vmin);
+    vacc12 = __builtin_wasm_max_f32(vacc12, vmin);
+    vacc13 = __builtin_wasm_max_f32(vacc13, vmin);
+
+    const float vmax = params->scalar.max;  // upper clamp bound
+    vacc00 = __builtin_wasm_min_f32(vacc00, vmax);
+    vacc01 = __builtin_wasm_min_f32(vacc01, vmax);
+    vacc02 = __builtin_wasm_min_f32(vacc02, vmax);
+    vacc03 = __builtin_wasm_min_f32(vacc03, vmax);
+    vacc10 = __builtin_wasm_min_f32(vacc10, vmax);
+    vacc11 = __builtin_wasm_min_f32(vacc11, vmax);
+    vacc12 = __builtin_wasm_min_f32(vacc12, vmax);
+    vacc13 = __builtin_wasm_min_f32(vacc13, vmax);
+
+    if XNN_LIKELY(nc >= 4) {  // full tile: store all four columns of every row
+      c1[0] = vacc10;  // row 1 is stored before row 0, so when c1 aliases c0 the row-0 values are written last
+      c1[1] = vacc11;
+      c1[2] = vacc12;
+      c1[3] = vacc13;
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      c0[0] = vacc00;
+      c0[1] = vacc01;
+      c0[2] = vacc02;
+      c0[3] = vacc03;
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);  // rewind the indirection buffer by ks bytes for the next column tile
+      nc -= 4;
+    } else {  // remainder: 1 to 3 columns are left
+      if (nc & 2) {  // store columns 0-1, then move column 2 into slot 0 for the single-column store below
+        c1[0] = vacc10;
+        c1[1] = vacc11;
+        vacc10 = vacc12;
+        c1 += 2;
+        c0[0] = vacc00;
+        c0[1] = vacc01;
+        vacc00 = vacc02;
+        c0 += 2;
+      }
+      if (nc & 1) {  // store the one remaining column
+        c1[0] = vacc10;
+        c0[0] = vacc00;
+      }
+
+      nc = 0;  // the remainder tile is always the last one
+    }
+  } while (nc != 0);
+}

diff --git a/src/f32-igemm/gen/4x2-wasm.c b/src/f32-igemm/gen/4x2-wasm.c
new file mode 100644
index 0000000..7127072
--- /dev/null
+++ b/src/f32-igemm/gen/4x2-wasm.c

@@ -0,0 +1,163 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_igemm_ukernel_4x2__wasm(  // f32 indirect-GEMM micro-kernel: tiles of up to 4 rows x 2 columns, clamped via the WAsm min/max builtins
+    size_t mr,  // number of rows to compute; asserted to be in 1..4
+    size_t nc,  // output columns still to produce; 2 are consumed per outer iteration
+    size_t kc,  // reduction length per input pointer, in bytes; asserted to be a non-zero multiple of sizeof(float)
+    size_t ks,  // bytes of indirection pointers consumed per column tile; asserted to be a non-zero multiple of 4 * sizeof(void*)
+    const float**restrict a,  // indirection buffer of input row pointers, read 4 per step of the p loop
+    const float*restrict w,  // per column tile: 2 initial accumulator values (presumably bias), then 2 weights per reduction step
+    float*restrict c,  // row 0 of the output; rows 1..3 are located via cm_stride
+    size_t cm_stride,  // byte distance between consecutive output rows
+    size_t cn_stride,  // bytes added to every output row pointer after a full 2-column tile
+    size_t a_offset,  // byte offset added to every input pointer that differs from zero
+    const float* zero,  // sentinel pointer; inputs equal to it are read without a_offset
+    const union xnn_f32_output_params params[restrict static 1])  // provides the scalar.min / scalar.max clamp bounds
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {  // row 1 not requested: alias its output pointer to row 0
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {  // row 2 not requested: alias its output pointer to row 1
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {  // row 3 not requested: alias its output pointer to row 2
+    c3 = c2;
+  }
+
+  do {  // one iteration per tile of up to 2 output columns
+    float vacc00 = w[0];
+    float vacc01 = w[1];
+    float vacc10 = vacc00;  // every row starts from the same two initial values
+    float vacc11 = vacc01;
+    float vacc20 = vacc00;
+    float vacc21 = vacc01;
+    float vacc30 = vacc00;
+    float vacc31 = vacc01;
+    w += 2;
+
+    size_t p = ks;  // bytes of indirection pointers still to consume for this tile
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {  // the zero sentinel is used as-is, without the offset
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;  // bytes of each input row still to process
+      do {
+        const float va0 = *a0++;
+        const float va1 = *a1++;
+        const float va2 = *a2++;
+        const float va3 = *a3++;
+
+        const float vb0 = w[0];
+        const float vb1 = w[1];
+        w += 2;
+
+        vacc00 += va0 * vb0;
+        vacc01 += va0 * vb1;
+        vacc10 += va1 * vb0;
+        vacc11 += va1 * vb1;
+        vacc20 += va2 * vb0;
+        vacc21 += va2 * vb1;
+        vacc30 += va3 * vb0;
+        vacc31 += va3 * vb1;
+
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 4 * sizeof(void*);  // four indirection pointers were consumed
+    } while (p != 0);
+
+    const float vmin = params->scalar.min;  // lower clamp bound
+    vacc00 = __builtin_wasm_max_f32(vacc00, vmin);
+    vacc01 = __builtin_wasm_max_f32(vacc01, vmin);
+    vacc10 = __builtin_wasm_max_f32(vacc10, vmin);
+    vacc11 = __builtin_wasm_max_f32(vacc11, vmin);
+    vacc20 = __builtin_wasm_max_f32(vacc20, vmin);
+    vacc21 = __builtin_wasm_max_f32(vacc21, vmin);
+    vacc30 = __builtin_wasm_max_f32(vacc30, vmin);
+    vacc31 = __builtin_wasm_max_f32(vacc31, vmin);
+
+    const float vmax = params->scalar.max;  // upper clamp bound
+    vacc00 = __builtin_wasm_min_f32(vacc00, vmax);
+    vacc01 = __builtin_wasm_min_f32(vacc01, vmax);
+    vacc10 = __builtin_wasm_min_f32(vacc10, vmax);
+    vacc11 = __builtin_wasm_min_f32(vacc11, vmax);
+    vacc20 = __builtin_wasm_min_f32(vacc20, vmax);
+    vacc21 = __builtin_wasm_min_f32(vacc21, vmax);
+    vacc30 = __builtin_wasm_min_f32(vacc30, vmax);
+    vacc31 = __builtin_wasm_min_f32(vacc31, vmax);
+
+    if XNN_LIKELY(nc >= 2) {  // full tile: store both columns of every row
+      c3[0] = vacc30;  // rows are stored from 3 down to 0, so with aliased output pointers the lower row is written last
+      c3[1] = vacc31;
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      c2[0] = vacc20;
+      c2[1] = vacc21;
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      c1[0] = vacc10;
+      c1[1] = vacc11;
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      c0[0] = vacc00;
+      c0[1] = vacc01;
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);  // rewind the indirection buffer by ks bytes for the next column tile
+      nc -= 2;
+    } else {  // remainder: fewer than 2 columns are left
+      if (nc & 1) {
+        c3[0] = vacc30;
+        c2[0] = vacc20;
+        c1[0] = vacc10;
+        c0[0] = vacc00;
+      }
+
+      nc = 0;  // the remainder tile is always the last one
+    }
+  } while (nc != 0);
+}

diff --git a/src/f32-igemm/gen/4x4-wasm.c b/src/f32-igemm/gen/4x4-wasm.c
new file mode 100644
index 0000000..cf6da4e
--- /dev/null
+++ b/src/f32-igemm/gen/4x4-wasm.c

@@ -0,0 +1,223 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-igemm/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/igemm.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_igemm_ukernel_4x4__wasm(  // f32 indirect-GEMM micro-kernel: tiles of up to 4 rows x 4 columns, clamped via the WAsm min/max builtins
+    size_t mr,  // number of rows to compute; asserted to be in 1..4
+    size_t nc,  // output columns still to produce; 4 are consumed per outer iteration
+    size_t kc,  // reduction length per input pointer, in bytes; asserted to be a non-zero multiple of sizeof(float)
+    size_t ks,  // bytes of indirection pointers consumed per column tile; asserted to be a non-zero multiple of 4 * sizeof(void*)
+    const float**restrict a,  // indirection buffer of input row pointers, read 4 per step of the p loop
+    const float*restrict w,  // per column tile: 4 initial accumulator values (presumably bias), then 4 weights per reduction step
+    float*restrict c,  // row 0 of the output; rows 1..3 are located via cm_stride
+    size_t cm_stride,  // byte distance between consecutive output rows
+    size_t cn_stride,  // bytes added to every output row pointer after a full 4-column tile
+    size_t a_offset,  // byte offset added to every input pointer that differs from zero
+    const float* zero,  // sentinel pointer; inputs equal to it are read without a_offset
+    const union xnn_f32_output_params params[restrict static 1])  // provides the scalar.min / scalar.max clamp bounds
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(float) == 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(float) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  float* c0 = c;
+  float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {  // row 1 not requested: alias its output pointer to row 0
+    c1 = c0;
+  }
+  float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {  // row 2 not requested: alias its output pointer to row 1
+    c2 = c1;
+  }
+  float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {  // row 3 not requested: alias its output pointer to row 2
+    c3 = c2;
+  }
+
+  do {  // one iteration per tile of up to 4 output columns
+    float vacc00 = w[0];
+    float vacc01 = w[1];
+    float vacc02 = w[2];
+    float vacc03 = w[3];
+    float vacc10 = vacc00;  // every row starts from the same four initial values
+    float vacc11 = vacc01;
+    float vacc12 = vacc02;
+    float vacc13 = vacc03;
+    float vacc20 = vacc00;
+    float vacc21 = vacc01;
+    float vacc22 = vacc02;
+    float vacc23 = vacc03;
+    float vacc30 = vacc00;
+    float vacc31 = vacc01;
+    float vacc32 = vacc02;
+    float vacc33 = vacc03;
+    w += 4;
+
+    size_t p = ks;  // bytes of indirection pointers still to consume for this tile
+    do {
+      const float* restrict a0 = a[0];
+      assert(a0 != NULL);
+      if XNN_UNPREDICTABLE(a0 != zero) {  // the zero sentinel is used as-is, without the offset
+        a0 = (const float*) ((uintptr_t) a0 + a_offset);
+      }
+      const float* restrict a1 = a[1];
+      assert(a1 != NULL);
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const float*) ((uintptr_t) a1 + a_offset);
+      }
+      const float* restrict a2 = a[2];
+      assert(a2 != NULL);
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const float*) ((uintptr_t) a2 + a_offset);
+      }
+      const float* restrict a3 = a[3];
+      assert(a3 != NULL);
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const float*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;  // bytes of each input row still to process
+      do {
+        const float va0 = *a0++;
+        const float va1 = *a1++;
+        const float va2 = *a2++;
+        const float va3 = *a3++;
+
+        const float vb0 = w[0];
+        const float vb1 = w[1];
+        const float vb2 = w[2];
+        const float vb3 = w[3];
+        w += 4;
+
+        vacc00 += va0 * vb0;
+        vacc01 += va0 * vb1;
+        vacc02 += va0 * vb2;
+        vacc03 += va0 * vb3;
+        vacc10 += va1 * vb0;
+        vacc11 += va1 * vb1;
+        vacc12 += va1 * vb2;
+        vacc13 += va1 * vb3;
+        vacc20 += va2 * vb0;
+        vacc21 += va2 * vb1;
+        vacc22 += va2 * vb2;
+        vacc23 += va2 * vb3;
+        vacc30 += va3 * vb0;
+        vacc31 += va3 * vb1;
+        vacc32 += va3 * vb2;
+        vacc33 += va3 * vb3;
+
+        k -= sizeof(float);
+      } while (k != 0);
+      p -= 4 * sizeof(void*);  // four indirection pointers were consumed
+    } while (p != 0);
+
+    const float vmin = params->scalar.min;  // lower clamp bound
+    vacc00 = __builtin_wasm_max_f32(vacc00, vmin);
+    vacc01 = __builtin_wasm_max_f32(vacc01, vmin);
+    vacc02 = __builtin_wasm_max_f32(vacc02, vmin);
+    vacc03 = __builtin_wasm_max_f32(vacc03, vmin);
+    vacc10 = __builtin_wasm_max_f32(vacc10, vmin);
+    vacc11 = __builtin_wasm_max_f32(vacc11, vmin);
+    vacc12 = __builtin_wasm_max_f32(vacc12, vmin);
+    vacc13 = __builtin_wasm_max_f32(vacc13, vmin);
+    vacc20 = __builtin_wasm_max_f32(vacc20, vmin);
+    vacc21 = __builtin_wasm_max_f32(vacc21, vmin);
+    vacc22 = __builtin_wasm_max_f32(vacc22, vmin);
+    vacc23 = __builtin_wasm_max_f32(vacc23, vmin);
+    vacc30 = __builtin_wasm_max_f32(vacc30, vmin);
+    vacc31 = __builtin_wasm_max_f32(vacc31, vmin);
+    vacc32 = __builtin_wasm_max_f32(vacc32, vmin);
+    vacc33 = __builtin_wasm_max_f32(vacc33, vmin);
+
+    const float vmax = params->scalar.max;  // upper clamp bound
+    vacc00 = __builtin_wasm_min_f32(vacc00, vmax);
+    vacc01 = __builtin_wasm_min_f32(vacc01, vmax);
+    vacc02 = __builtin_wasm_min_f32(vacc02, vmax);
+    vacc03 = __builtin_wasm_min_f32(vacc03, vmax);
+    vacc10 = __builtin_wasm_min_f32(vacc10, vmax);
+    vacc11 = __builtin_wasm_min_f32(vacc11, vmax);
+    vacc12 = __builtin_wasm_min_f32(vacc12, vmax);
+    vacc13 = __builtin_wasm_min_f32(vacc13, vmax);
+    vacc20 = __builtin_wasm_min_f32(vacc20, vmax);
+    vacc21 = __builtin_wasm_min_f32(vacc21, vmax);
+    vacc22 = __builtin_wasm_min_f32(vacc22, vmax);
+    vacc23 = __builtin_wasm_min_f32(vacc23, vmax);
+    vacc30 = __builtin_wasm_min_f32(vacc30, vmax);
+    vacc31 = __builtin_wasm_min_f32(vacc31, vmax);
+    vacc32 = __builtin_wasm_min_f32(vacc32, vmax);
+    vacc33 = __builtin_wasm_min_f32(vacc33, vmax);
+
+    if XNN_LIKELY(nc >= 4) {  // full tile: store all four columns of every row
+      c3[0] = vacc30;  // rows are stored from 3 down to 0, so with aliased output pointers the lower row is written last
+      c3[1] = vacc31;
+      c3[2] = vacc32;
+      c3[3] = vacc33;
+      c3 = (float*) ((uintptr_t) c3 + cn_stride);
+      c2[0] = vacc20;
+      c2[1] = vacc21;
+      c2[2] = vacc22;
+      c2[3] = vacc23;
+      c2 = (float*) ((uintptr_t) c2 + cn_stride);
+      c1[0] = vacc10;
+      c1[1] = vacc11;
+      c1[2] = vacc12;
+      c1[3] = vacc13;
+      c1 = (float*) ((uintptr_t) c1 + cn_stride);
+      c0[0] = vacc00;
+      c0[1] = vacc01;
+      c0[2] = vacc02;
+      c0[3] = vacc03;
+      c0 = (float*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const float**restrict) ((uintptr_t) a - ks);  // rewind the indirection buffer by ks bytes for the next column tile
+      nc -= 4;
+    } else {  // remainder: 1 to 3 columns are left
+      if (nc & 2) {  // store columns 0-1, then move column 2 into slot 0 for the single-column store below
+        c3[0] = vacc30;
+        c3[1] = vacc31;
+        vacc30 = vacc32;
+        c3 += 2;
+        c2[0] = vacc20;
+        c2[1] = vacc21;
+        vacc20 = vacc22;
+        c2 += 2;
+        c1[0] = vacc10;
+        c1[1] = vacc11;
+        vacc10 = vacc12;
+        c1 += 2;
+        c0[0] = vacc00;
+        c0[1] = vacc01;
+        vacc00 = vacc02;
+        c0 += 2;
+      }
+      if (nc & 1) {  // store the one remaining column
+        c3[0] = vacc30;
+        c2[0] = vacc20;
+        c1[0] = vacc10;
+        c0[0] = vacc00;
+      }
+
+      nc = 0;  // the remainder tile is always the last one
+    }
+  } while (nc != 0);
+}

diff --git a/src/f32-igemm/scalar.c.in b/src/f32-igemm/scalar.c.in
index e4a8161..d713f51 100644
--- a/src/f32-igemm/scalar.c.in
+++ b/src/f32-igemm/scalar.c.in

@@ -9,7 +9,9 @@
 #include <xnnpack/math.h>
 
 
-void xnn_f32_igemm_ukernel_${MR}x${NR}__scalar(
+$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32"
+$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32"
+void xnn_f32_igemm_ukernel_${MR}x${NR}__${"wasm" if WASM else "scalar"}(
     size_t mr,
     size_t nc,
     size_t kc,
@@ -90,12 +92,12 @@
     const float vmin = params->scalar.min;
     $for M in range(MR):
       $for N in range(NR):
-        vacc${M}${N} = math_max_f32(vacc${M}${N}, vmin);
+        vacc${M}${N} = ${MAX_F32}(vacc${M}${N}, vmin);
 
     const float vmax = params->scalar.max;
     $for M in range(MR):
       $for N in range(NR):
-        vacc${M}${N} = math_min_f32(vacc${M}${N}, vmax);
+        vacc${M}${N} = ${MIN_F32}(vacc${M}${N}, vmax);
 
     if XNN_LIKELY(nc >= ${NR}) {
       $for M in reversed(range(MR)):

diff --git a/src/f32-maxpool/9p8x-wasm-c1.c b/src/f32-maxpool/9p8x-wasm-c1.c
new file mode 100644
index 0000000..fcad9e5
--- /dev/null
+++ b/src/f32-maxpool/9p8x-wasm-c1.c

@@ -0,0 +1,173 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/maxpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_maxpool_ukernel_9p8x__wasm_c1(  // Per-channel max over kernel_elements input rows, clamped to [params min, params max].
+    size_t output_pixels,  // outer-loop trip count; asserted non-zero
+    size_t kernel_elements,  // rows reduced per output pixel; asserted non-zero
+    size_t channels,  // floats read per row and written per output pixel; asserted non-zero
+    const float** input,  // array of row pointers, consumed in order
+    size_t input_offset,  // byte offset added to every row pointer before it is read
+    float* output,
+    size_t input_increment,  // bytes added to `input` after each output pixel
+    size_t output_increment,  // bytes added to the output cursor after each output pixel
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(output_pixels != 0);
+  assert(kernel_elements != 0);
+  assert(channels != 0);
+
+  const float voutput_min = params->scalar.min;
+  const float voutput_max = params->scalar.max;
+  do {
+    float* o = output;
+    {  // First pass: up to 9 rows, results written straight to the output.
+      const float* i0 = *input++;  // nine pointers are always consumed, even when kernel_elements < 9
+      const float* i1 = *input++;
+      const float* i2 = *input++;
+      const float* i3 = *input++;
+      const float* i4 = *input++;
+      const float* i5 = *input++;
+      const float* i6 = *input++;
+      const float* i7 = *input++;
+      const float* i8 = *input++;
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      i8 = (const float*) ((uintptr_t) i8 + input_offset);
+      if (kernel_elements < 2) {  // slots at index >= kernel_elements alias i0, so they re-read row 0 and add nothing new to the max
+        i1 = i0;
+      }
+      if (kernel_elements <= 2) {
+        i2 = i0;
+      }
+      if (kernel_elements < 4) {
+        i3 = i0;
+      }
+      if (kernel_elements <= 4) {
+        i4 = i0;
+      }
+      if (kernel_elements < 6) {
+        i5 = i0;
+      }
+      if (kernel_elements <= 6) {
+        i6 = i0;
+      }
+      if (kernel_elements < 8) {
+        i7 = i0;
+      }
+      if (kernel_elements <= 8) {
+        i8 = i0;
+      }
+
+      size_t c = channels;
+      do {
+        const float vi0 = *i0++;
+        const float vi1 = *i1++;
+        const float vi2 = *i2++;
+        const float vi3 = *i3++;
+        const float vi4 = *i4++;
+        const float vi5 = *i5++;
+        const float vi6 = *i6++;
+        const float vi7 = *i7++;
+        const float vi8 = *i8++;
+
+        const float vmax01 = __builtin_wasm_max_f32(vi0, vi1);  // pairwise reduction tree over the nine values
+        const float vmax23 = __builtin_wasm_max_f32(vi2, vi3);
+        const float vmax45 = __builtin_wasm_max_f32(vi4, vi5);
+        const float vmax67 = __builtin_wasm_max_f32(vi6, vi7);
+        const float vmax018 = __builtin_wasm_max_f32(vmax01, vi8);
+
+        const float vmax2345 = __builtin_wasm_max_f32(vmax23, vmax45);
+        const float vmax01678 = __builtin_wasm_max_f32(vmax018, vmax67);
+        float vout = __builtin_wasm_max_f32(vmax2345, vmax01678);
+        vout = __builtin_wasm_max_f32(vout, voutput_min);  // lower clamp
+        vout = __builtin_wasm_min_f32(vout, voutput_max);  // upper clamp
+
+        *o++ = vout;
+      } while (--c != 0);
+    }
+
+    for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {  // Remaining rows: 8 pointers consumed per pass, folded into the stored outputs.
+      const float* i0 = *input++;
+      const float* i1 = *input++;
+      const float* i2 = *input++;
+      const float* i3 = *input++;
+      const float* i4 = *input++;
+      const float* i5 = *input++;
+      const float* i6 = *input++;
+      const float* i7 = *input++;
+      i0 = (const float*) ((uintptr_t) i0 + input_offset);
+      i1 = (const float*) ((uintptr_t) i1 + input_offset);
+      i2 = (const float*) ((uintptr_t) i2 + input_offset);
+      i3 = (const float*) ((uintptr_t) i3 + input_offset);
+      i4 = (const float*) ((uintptr_t) i4 + input_offset);
+      i5 = (const float*) ((uintptr_t) i5 + input_offset);
+      i6 = (const float*) ((uintptr_t) i6 + input_offset);
+      i7 = (const float*) ((uintptr_t) i7 + input_offset);
+      if (k < 2) {  // k is the number of rows still to reduce; slots at index >= k alias i0
+        i1 = i0;
+      }
+      if (k <= 2) {
+        i2 = i0;
+      }
+      if (k < 4) {
+        i3 = i0;
+      }
+      if (k <= 4) {
+        i4 = i0;
+      }
+      if (k < 6) {
+        i5 = i0;
+      }
+      if (k <= 6) {
+        i6 = i0;
+      }
+      if (k < 8) {
+        i7 = i0;
+      }
+
+      o = output;  // rewind to the first output of the current pixel
+      size_t c = channels;
+      do {
+        const float vi0 = *i0++;
+        const float vi1 = *i1++;
+        const float vi2 = *i2++;
+        const float vi3 = *i3++;
+        const float vi4 = *i4++;
+        const float vi5 = *i5++;
+        const float vi6 = *i6++;
+        const float vi7 = *i7++;
+        const float vi8 = *o;  // value stored by the previous pass takes the ninth slot
+
+        const float vmax01 = __builtin_wasm_max_f32(vi0, vi1);
+        const float vmax23 = __builtin_wasm_max_f32(vi2, vi3);
+        const float vmax45 = __builtin_wasm_max_f32(vi4, vi5);
+        const float vmax67 = __builtin_wasm_max_f32(vi6, vi7);
+        const float vmax018 = __builtin_wasm_max_f32(vmax01, vi8);
+
+        const float vmax2345 = __builtin_wasm_max_f32(vmax23, vmax45);
+        const float vmax01678 = __builtin_wasm_max_f32(vmax018, vmax67);
+        float vout = __builtin_wasm_max_f32(vmax2345, vmax01678);
+        vout = __builtin_wasm_max_f32(vout, voutput_min);
+        vout = __builtin_wasm_min_f32(vout, voutput_max);
+
+        *o++ = vout;
+      } while (--c != 0);
+    }
+    input = (const float**) ((uintptr_t) input + input_increment);
+    output = (float*) ((uintptr_t) o + output_increment);  // o is one past the last float written for this pixel
+  } while (--output_pixels != 0);
+}

diff --git a/src/f32-pavgpool/mp9p8q-wasm.c b/src/f32-pavgpool/mp9p8q-wasm.c
new file mode 100644
index 0000000..479f0a6
--- /dev/null
+++ b/src/f32-pavgpool/mp9p8q-wasm.c

@@ -0,0 +1,172 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/pavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_pavgpool_ukernel_mp9p8q__wasm(  // Multi-pass pooling: sums ks rows per output pixel, scales by a per-pixel multiplier, clamps to [params min, params max].
+    size_t n,  // outer-loop trip count (output pixels); asserted non-zero
+    size_t ks,  // rows summed per output pixel; asserted > 9
+    size_t kc,  // floats read per row and written per output pixel; asserted non-zero
+    const float** input,  // array of row pointers
+    const float* zero,  // substitute row for unused slots in the final pass; presumably kc zeros -- confirm with caller
+    const float* multiplier,  // one scale factor is read per output pixel
+    float* buffer,  // scratch for kc partial sums carried between passes
+    float* output,
+    size_t input_increment,  // bytes added to `input` in the final pass of each output pixel
+    size_t output_increment,  // bytes added to `output` after each output pixel
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(ks > 9);
+  assert(kc != 0);
+
+  const float voutput_min = params->scalar.min;
+  const float voutput_max = params->scalar.max;
+
+  do {
+    {  // First pass: sum nine rows into the scratch buffer.
+      const float* i0 = *input++;
+      const float* i1 = *input++;
+      const float* i2 = *input++;
+      const float* i3 = *input++;
+      const float* i4 = *input++;
+      const float* i5 = *input++;
+      const float* i6 = *input++;
+      const float* i7 = *input++;
+      const float* i8 = *input++;
+
+      float* b = buffer;
+      size_t k = kc;
+      do {
+        const float vi0 = *i0++;
+        const float vi1 = *i1++;
+        const float vi2 = *i2++;
+        const float vi3 = *i3++;
+        const float vi4 = *i4++;
+        const float vi5 = *i5++;
+        const float vi6 = *i6++;
+        const float vi7 = *i7++;
+        const float vi8 = *i8++;
+
+        const float vsum01 = vi0 + vi1;
+        const float vsum23 = vi2 + vi3;
+        const float vsum45 = vi4 + vi5;
+        const float vsum67 = vi6 + vi7;
+        const float vsum018 = vsum01 + vi8;
+        const float vsum2345 = vsum23 + vsum45;
+        const float vsum01678 = vsum018 + vsum67;
+        const float vsum = vsum2345 + vsum01678;
+
+        *b++ = vsum;
+      } while (--k != 0);
+    }
+
+    size_t m = ks;
+    for (m -= 9; m > 8; m -= 8) {  // Middle passes: add eight more rows each; the loop exits with 1..8 rows left in m.
+      const float* i0 = *input++;
+      const float* i1 = *input++;
+      const float* i2 = *input++;
+      const float* i3 = *input++;
+      const float* i4 = *input++;
+      const float* i5 = *input++;
+      const float* i6 = *input++;
+      const float* i7 = *input++;
+
+      float* b = buffer;
+      size_t k = kc;
+      do {
+        const float vi0 = *i0++;
+        const float vi1 = *i1++;
+        const float vi2 = *i2++;
+        const float vi3 = *i3++;
+        const float vi4 = *i4++;
+        const float vi5 = *i5++;
+        const float vi6 = *i6++;
+        const float vi7 = *i7++;
+        const float vacc = *b;  // partial sum from the earlier passes
+
+        const float vsum01 = vi0 + vi1;
+        const float vsum23 = vi2 + vi3;
+        const float vsum45 = vi4 + vi5;
+        const float vsum67 = vi6 + vi7;
+        const float vsum01a = vsum01 + vacc;
+        const float vsum2345 = vsum23 + vsum45;
+        const float vsum0167a = vsum01a + vsum67;
+        const float vsum = vsum2345 + vsum0167a;
+
+        *b++ = vsum;
+      } while (--k != 0);
+    }
+
+    {  // Final pass: add the last m rows, scale, clamp, and write the output.
+      const float* i0 = input[0];  // indexed reads: `input` is advanced only by input_increment below
+      const float* i1 = input[1];
+      const float* i2 = input[2];
+      const float* i3 = input[3];
+      const float* i4 = input[4];
+      const float* i5 = input[5];
+      const float* i6 = input[6];
+      const float* i7 = input[7];
+      input = (const float**) ((uintptr_t) input + input_increment);
+      if (m < 2) {  // slots at index >= m read from `zero` instead of their row pointer
+        i1 = zero;
+      }
+      if (m <= 2) {
+        i2 = zero;
+      }
+      if (m < 4) {
+        i3 = zero;
+      }
+      if (m <= 4) {
+        i4 = zero;
+      }
+      if (m < 6) {
+        i5 = zero;
+      }
+      if (m <= 6) {
+        i6 = zero;
+      }
+      if (m != 8) {  // m is at most 8 here, so this is equivalent to m < 8
+        i7 = zero;
+      }
+
+      const float vmultiplier = *multiplier++;
+
+      size_t k = kc;
+      float* b = buffer;
+      do {
+        const float vi0 = *i0++;
+        const float vi1 = *i1++;
+        const float vi2 = *i2++;
+        const float vi3 = *i3++;
+        const float vi4 = *i4++;
+        const float vi5 = *i5++;
+        const float vi6 = *i6++;
+        const float vi7 = *i7++;
+        const float vacc = *b++;  // partial sum from the earlier passes
+
+        const float vsum01 = vi0 + vi1;
+        const float vsum23 = vi2 + vi3;
+        const float vsum45 = vi4 + vi5;
+        const float vsum67 = vi6 + vi7;
+        const float vsum01a = vsum01 + vacc;
+        const float vsum2345 = vsum23 + vsum45;
+        const float vsum0167a = vsum01a + vsum67;
+        const float vsum = vsum2345 + vsum0167a;
+
+        float vout = vsum * vmultiplier;
+        vout = __builtin_wasm_max_f32(vout, voutput_min);  // lower clamp
+        vout = __builtin_wasm_min_f32(vout, voutput_max);  // upper clamp
+
+        *output++ = vout;
+      } while (--k != 0);
+    }
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--n != 0);
+}

diff --git a/src/f32-pavgpool/up9-wasm.c b/src/f32-pavgpool/up9-wasm.c
new file mode 100644
index 0000000..6932907
--- /dev/null
+++ b/src/f32-pavgpool/up9-wasm.c

@@ -0,0 +1,99 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/pavgpool.h>
+#include <xnnpack/math.h>
+
+
+void xnn_f32_pavgpool_ukernel_up9__wasm(  // Single-pass pooling for ks <= 9: sums the rows, scales by a per-pixel multiplier, clamps to [params min, params max].
+    size_t n,  // outer-loop trip count (output pixels); asserted non-zero
+    size_t ks,  // rows summed per output pixel; asserted to be in 1..9
+    size_t kc,  // floats read per row and written per output pixel; asserted non-zero
+    const float** input,  // array of row pointers; nine entries are read per output pixel
+    const float* zero,  // substitute row for slots at index >= ks; presumably kc zeros -- confirm with caller
+    const float* multiplier,  // one scale factor is read per output pixel
+    float* output,
+    size_t input_increment,  // bytes added to `input` after each output pixel
+    size_t output_increment,  // bytes added to `output` after each output pixel
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(ks != 0);
+  assert(ks <= 9);
+  assert(kc != 0);
+
+  const float voutput_min = params->scalar.min;
+  const float voutput_max = params->scalar.max;
+
+  do {
+    const float* i0 = input[0];  // indexed reads: `input` is advanced only by input_increment below
+    const float* i1 = input[1];
+    const float* i2 = input[2];
+    const float* i3 = input[3];
+    const float* i4 = input[4];
+    const float* i5 = input[5];
+    const float* i6 = input[6];
+    const float* i7 = input[7];
+    const float* i8 = input[8];
+    input = (const float**) ((uintptr_t) input + input_increment);
+    if (ks < 2) {  // slots at index >= ks read from `zero` instead of their row pointer
+      i1 = zero;
+    }
+    if (ks <= 2) {
+      i2 = zero;
+    }
+    if (ks < 4) {
+      i3 = zero;
+    }
+    if (ks <= 4) {
+      i4 = zero;
+    }
+    if (ks < 6) {
+      i5 = zero;
+    }
+    if (ks <= 6) {
+      i6 = zero;
+    }
+    if (ks < 8) {
+      i7 = zero;
+    }
+    if (ks <= 8) {
+      i8 = zero;
+    }
+
+    const float vmultiplier = *multiplier++;
+
+    size_t k = kc;
+    do {
+      const float vi0 = *i0++;
+      const float vi1 = *i1++;
+      const float vi2 = *i2++;
+      const float vi3 = *i3++;
+      const float vi4 = *i4++;
+      const float vi5 = *i5++;
+      const float vi6 = *i6++;
+      const float vi7 = *i7++;
+      const float vi8 = *i8++;
+
+      const float vsum01 = vi0 + vi1;  // pairwise summation tree over the nine values
+      const float vsum23 = vi2 + vi3;
+      const float vsum45 = vi4 + vi5;
+      const float vsum67 = vi6 + vi7;
+      const float vsum018 = vsum01 + vi8;
+      const float vsum2345 = vsum23 + vsum45;
+      const float vsum01678 = vsum018 + vsum67;
+      const float vsum = vsum2345 + vsum01678;
+
+      float vout = vsum * vmultiplier;
+      vout = __builtin_wasm_max_f32(vout, voutput_min);  // lower clamp
+      vout = __builtin_wasm_min_f32(vout, voutput_max);  // upper clamp
+
+      *output++ = vout;
+    } while (--k != 0);
+    output = (float*) ((uintptr_t) output + output_increment);
+  } while (--n != 0);
+}

diff --git a/src/f32-prelu/gen/wasm-2x1.c b/src/f32-prelu/gen/wasm-2x1.c
new file mode 100644
index 0000000..30dceff
--- /dev/null
+++ b/src/f32-prelu/gen/wasm-2x1.c

@@ -0,0 +1,79 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-prelu/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <math.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/prelu.h>
+
+
+void xnn_f32_prelu_ukernel__wasm_2x1(  // PReLU on two rows at a time, one channel per step: inputs with the sign bit set are multiplied by the channel weight, then clamped.
+    size_t rows,  // rows to process; asserted non-zero
+    size_t channels,  // row length in bytes; asserted to be a non-zero multiple of sizeof(float)
+    const float*restrict input,
+    size_t input_stride,  // byte distance between consecutive input rows
+    const float*restrict weights,  // one weight per channel; re-read from the start for every row pair
+    float*restrict output,
+    size_t output_stride,  // byte distance between consecutive output rows
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(rows != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+
+  const float* i0 = input;
+  float* o0 = output;
+  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
+  float* o1 = (float*) ((uintptr_t) o0 + output_stride);
+  if XNN_UNPREDICTABLE(rows < 2) {  // only one row: the second lane aliases the first and repeats its work
+    i1 = i0;
+    o1 = o0;
+  }
+
+  const size_t input_increment = input_stride * 2 - channels;  // the inner loop already advanced the pointers by `channels` bytes
+  const size_t output_increment = output_stride * 2 - channels;
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* w = weights;
+    size_t c = channels;
+    do {
+      const float vw = *w++;
+
+      const float vi0 = *i0++;
+      const float vi1 = *i1++;
+
+      float vacc0 = signbit(vi0) ? vi0 * vw : vi0;
+      float vacc1 = signbit(vi1) ? vi1 * vw : vi1;
+
+      vacc0 = __builtin_wasm_max_f32(vacc0, vmin);  // lower clamp
+      vacc1 = __builtin_wasm_max_f32(vacc1, vmin);
+
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);  // upper clamp
+      vacc1 = __builtin_wasm_min_f32(vacc1, vmax);
+
+      *o0++ = vacc0;
+      *o1++ = vacc1;
+
+      c -= sizeof(float);
+    } while (c != 0);
+    i0 = (const float*) ((uintptr_t) i0 + input_increment);
+    o0 = (float*) ((uintptr_t) o0 + output_increment);
+    i1 = (const float*) ((uintptr_t) i1 + input_increment);
+    o1 = (float*) ((uintptr_t) o1 + output_increment);
+    if XNN_UNPREDICTABLE(rows < 4) {  // fewer than two rows remain after this pair, so the second lane aliases the first
+      i1 = i0;
+      o1 = o0;
+    }
+    rows = doz(rows, 2);  // doz is defined outside this file; presumably a subtraction that saturates at zero -- confirm
+  } while (rows != 0);
+}

diff --git a/src/f32-prelu/gen/wasm-2x4.c b/src/f32-prelu/gen/wasm-2x4.c
new file mode 100644
index 0000000..9ec1380
--- /dev/null
+++ b/src/f32-prelu/gen/wasm-2x4.c

@@ -0,0 +1,134 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-prelu/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <math.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/prelu.h>
+
+
+void xnn_f32_prelu_ukernel__wasm_2x4(  // PReLU on two rows at a time, four channels per step plus a one-channel remainder loop; results are clamped.
+    size_t rows,  // rows to process; asserted non-zero
+    size_t channels,  // row length in bytes; asserted to be a non-zero multiple of sizeof(float)
+    const float*restrict input,
+    size_t input_stride,  // byte distance between consecutive input rows
+    const float*restrict weights,  // one weight per channel; re-read from the start for every row pair
+    float*restrict output,
+    size_t output_stride,  // byte distance between consecutive output rows
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(rows != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+
+  const float* i0 = input;
+  float* o0 = output;
+  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
+  float* o1 = (float*) ((uintptr_t) o0 + output_stride);
+  if XNN_UNPREDICTABLE(rows < 2) {  // only one row: the second lane aliases the first and repeats its work
+    i1 = i0;
+    o1 = o0;
+  }
+
+  const size_t input_increment = input_stride * 2 - channels;  // the channel loops already advanced the pointers by `channels` bytes
+  const size_t output_increment = output_stride * 2 - channels;
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* w = weights;
+    size_t c = channels;
+    for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {  // main loop: four channels of both rows per iteration
+      const float vw0 = w[0];
+      const float vw1 = w[1];
+      const float vw2 = w[2];
+      const float vw3 = w[3];
+
+      const float vi0x0 = i0[0];
+      const float vi0x1 = i0[1];
+      const float vi0x2 = i0[2];
+      const float vi0x3 = i0[3];
+      i0 += 4;
+      const float vi1x0 = i1[0];
+      const float vi1x1 = i1[1];
+      const float vi1x2 = i1[2];
+      const float vi1x3 = i1[3];
+      i1 += 4;
+
+      float vacc0x0 = signbit(vi0x0) ? vi0x0 * vw0 : vi0x0;  // scale by the weight only when the sign bit is set
+      float vacc0x1 = signbit(vi0x1) ? vi0x1 * vw1 : vi0x1;
+      float vacc0x2 = signbit(vi0x2) ? vi0x2 * vw2 : vi0x2;
+      float vacc0x3 = signbit(vi0x3) ? vi0x3 * vw3 : vi0x3;
+      float vacc1x0 = signbit(vi1x0) ? vi1x0 * vw0 : vi1x0;
+      float vacc1x1 = signbit(vi1x1) ? vi1x1 * vw1 : vi1x1;
+      float vacc1x2 = signbit(vi1x2) ? vi1x2 * vw2 : vi1x2;
+      float vacc1x3 = signbit(vi1x3) ? vi1x3 * vw3 : vi1x3;
+
+      vacc0x0 = __builtin_wasm_max_f32(vacc0x0, vmin);  // lower clamp
+      vacc0x1 = __builtin_wasm_max_f32(vacc0x1, vmin);
+      vacc0x2 = __builtin_wasm_max_f32(vacc0x2, vmin);
+      vacc0x3 = __builtin_wasm_max_f32(vacc0x3, vmin);
+      vacc1x0 = __builtin_wasm_max_f32(vacc1x0, vmin);
+      vacc1x1 = __builtin_wasm_max_f32(vacc1x1, vmin);
+      vacc1x2 = __builtin_wasm_max_f32(vacc1x2, vmin);
+      vacc1x3 = __builtin_wasm_max_f32(vacc1x3, vmin);
+
+      vacc0x0 = __builtin_wasm_min_f32(vacc0x0, vmax);  // upper clamp
+      vacc0x1 = __builtin_wasm_min_f32(vacc0x1, vmax);
+      vacc0x2 = __builtin_wasm_min_f32(vacc0x2, vmax);
+      vacc0x3 = __builtin_wasm_min_f32(vacc0x3, vmax);
+      vacc1x0 = __builtin_wasm_min_f32(vacc1x0, vmax);
+      vacc1x1 = __builtin_wasm_min_f32(vacc1x1, vmax);
+      vacc1x2 = __builtin_wasm_min_f32(vacc1x2, vmax);
+      vacc1x3 = __builtin_wasm_min_f32(vacc1x3, vmax);
+
+      o0[0] = vacc0x0;
+      o0[1] = vacc0x1;
+      o0[2] = vacc0x2;
+      o0[3] = vacc0x3;
+      o0 += 4;
+      o1[0] = vacc1x0;
+      o1[1] = vacc1x1;
+      o1[2] = vacc1x2;
+      o1[3] = vacc1x3;
+      o1 += 4;
+
+      w += 4;
+    }
+    for (; c != 0; c -= sizeof(float)) {  // remainder: the last 0..3 channels, one at a time
+      const float vw = *w++;
+
+      const float vi0 = *i0++;
+      const float vi1 = *i1++;
+
+      float vacc0 = signbit(vi0) ? vi0 * vw : vi0;
+      float vacc1 = signbit(vi1) ? vi1 * vw : vi1;
+
+      vacc0 = __builtin_wasm_max_f32(vacc0, vmin);
+      vacc1 = __builtin_wasm_max_f32(vacc1, vmin);
+
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+      vacc1 = __builtin_wasm_min_f32(vacc1, vmax);
+
+      *o0++ = vacc0;
+      *o1++ = vacc1;
+    }
+    i0 = (const float*) ((uintptr_t) i0 + input_increment);
+    o0 = (float*) ((uintptr_t) o0 + output_increment);
+    i1 = (const float*) ((uintptr_t) i1 + input_increment);
+    o1 = (float*) ((uintptr_t) o1 + output_increment);
+    if XNN_UNPREDICTABLE(rows < 4) {  // fewer than two rows remain after this pair, so the second lane aliases the first
+      i1 = i0;
+      o1 = o0;
+    }
+    rows = doz(rows, 2);  // doz is defined outside this file; presumably a subtraction that saturates at zero -- confirm
+  } while (rows != 0);
+}

diff --git a/src/f32-prelu/scalar.c.in b/src/f32-prelu/scalar.c.in
index f776987..e0f9bc3 100644
--- a/src/f32-prelu/scalar.c.in
+++ b/src/f32-prelu/scalar.c.in

@@ -14,7 +14,9 @@
 #include <xnnpack/prelu.h>
 
 
-void xnn_f32_prelu_ukernel__scalar_${ROW_TILE}x${CHANNEL_TILE}(
+$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32"
+$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32"
+void xnn_f32_prelu_ukernel__${"wasm" if WASM else "scalar"}_${ROW_TILE}x${CHANNEL_TILE}(
     size_t rows,
     size_t channels,
     const float*restrict input,
@@ -68,11 +70,11 @@
 
         $for M in range(ROW_TILE):
           $for C in range(CHANNEL_TILE):
-            vacc${M}x${ABC[C]} = math_max_f32(vacc${M}x${ABC[C]}, vmin);
+            vacc${M}x${ABC[C]} = ${MAX_F32}(vacc${M}x${ABC[C]}, vmin);
 
         $for M in range(ROW_TILE):
           $for C in range(CHANNEL_TILE):
-            vacc${M}x${ABC[C]} = math_min_f32(vacc${M}x${ABC[C]}, vmax);
+            vacc${M}x${ABC[C]} = ${MIN_F32}(vacc${M}x${ABC[C]}, vmax);
 
         $for M in range(ROW_TILE):
           $for C in range(CHANNEL_TILE):
@@ -91,10 +93,10 @@
           float vacc${M} = signbit(vi${M}) ? vi${M} * vw : vi${M};
 
         $for M in range(ROW_TILE):
-          vacc${M} = math_max_f32(vacc${M}, vmin);
+          vacc${M} = ${MAX_F32}(vacc${M}, vmin);
 
         $for M in range(ROW_TILE):
-          vacc${M} = math_min_f32(vacc${M}, vmax);
+          vacc${M} = ${MIN_F32}(vacc${M}, vmax);
 
         $for M in range(ROW_TILE):
           *o${M}++ = vacc${M};
@@ -112,11 +114,11 @@
 
         $for M in range(ROW_TILE):
           $for C in range(CHANNEL_TILE):
-            vacc${M} = math_max_f32(vacc${M}, vmin);
+            vacc${M} = ${MAX_F32}(vacc${M}, vmin);
 
         $for M in range(ROW_TILE):
           $for C in range(CHANNEL_TILE):
-            vacc${M} = math_min_f32(vacc${M}, vmax);
+            vacc${M} = ${MIN_F32}(vacc${M}, vmax);
 
         $for M in range(ROW_TILE):
           *o${M}++ = vacc${M};

diff --git a/src/f32-vbinary/gen/vadd-wasm-x1.c b/src/f32-vbinary/gen/vadd-wasm-x1.c
new file mode 100644
index 0000000..ee2811e
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-wasm-x1.c

@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_ukernel__wasm_x1(  // Element-wise y = a + b, clamped to [params min, params max]; one float per iteration.
+    size_t n,  // size of each array in bytes; asserted to be a non-zero multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va + vb;
+    vy = __builtin_wasm_max_f32(vy, vy_min);  // lower clamp
+    vy = __builtin_wasm_min_f32(vy, vy_max);  // upper clamp
+    *y++ = vy;
+  }
+}

diff --git a/src/f32-vbinary/gen/vadd-wasm-x2.c b/src/f32-vbinary/gen/vadd-wasm-x2.c
new file mode 100644
index 0000000..86fb847
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-wasm-x2.c

@@ -0,0 +1,60 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_ukernel__wasm_x2(  // Element-wise y = a + b, clamped to [params min, params max]; two floats per main-loop iteration.
+    size_t n,  // size of each array in bytes; asserted to be a non-zero multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);  // lower clamp
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);  // upper clamp
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // n is a multiple of sizeof(float) and below two floats here, so exactly one element remains
+    const float va = *a;
+    const float vb = *b;
+    float vy = va + vb;
+    vy = __builtin_wasm_max_f32(vy, vy_min);
+    vy = __builtin_wasm_min_f32(vy, vy_max);
+    *y = vy;
+  }
+}

diff --git a/src/f32-vbinary/gen/vadd-wasm-x4.c b/src/f32-vbinary/gen/vadd-wasm-x4.c
new file mode 100644
index 0000000..11da45e
--- /dev/null
+++ b/src/f32-vbinary/gen/vadd-wasm-x4.c

@@ -0,0 +1,75 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vadd_ukernel__wasm_x4(  // Element-wise y = a + b, clamped to [params min, params max]; four floats per main-loop iteration.
+    size_t n,  // size of each array in bytes; asserted to be a non-zero multiple of sizeof(float)
+    const float* a,
+    const float* b,
+    float* y,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 + vb0;
+    float vy1 = va1 + vb1;
+    float vy2 = va2 + vb2;
+    float vy3 = va3 + vb3;
+
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);  // lower clamp
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);  // upper clamp
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // one to three elements remain; handle them one at a time
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va + vb;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}

diff --git a/src/f32-vbinary/gen/vaddc-wasm-x1.c b/src/f32-vbinary/gen/vaddc-wasm-x1.c
new file mode 100644
index 0000000..141a621
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-wasm-x1.c

@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vaddc_ukernel__wasm_x1(  // Adds the single value *b to every element of a, clamped to [params min, params max]; one float per iteration.
+    size_t n,  // size of a and y in bytes; asserted to be a non-zero multiple of sizeof(float)
+    const float* a,
+    const float* b,  // only the first float is read
+    float* y,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  const float vb = *b;  // constant addend, loaded once
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    float vy = va + vb;
+    vy = __builtin_wasm_max_f32(vy, vy_min);  // lower clamp
+    vy = __builtin_wasm_min_f32(vy, vy_max);  // upper clamp
+    *y++ = vy;
+  }
+}

diff --git a/src/f32-vbinary/gen/vaddc-wasm-x2.c b/src/f32-vbinary/gen/vaddc-wasm-x2.c
new file mode 100644
index 0000000..bd4d0c6
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-wasm-x2.c

@@ -0,0 +1,56 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vaddc_ukernel__wasm_x2(  // Adds the single value *b to every element of a, clamped to [params min, params max]; two floats per main-loop iteration.
+    size_t n,  // size of a and y in bytes; asserted to be a non-zero multiple of sizeof(float)
+    const float* a,
+    const float* b,  // only the first float is read
+    float* y,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  const float vb = *b;  // constant addend, loaded once
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);  // lower clamp
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);  // upper clamp
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // n is a multiple of sizeof(float) and below two floats here, so exactly one element remains
+    const float va = *a;
+    float vy = va + vb;
+    vy = __builtin_wasm_max_f32(vy, vy_min);
+    vy = __builtin_wasm_min_f32(vy, vy_max);
+    *y = vy;
+  }
+}

diff --git a/src/f32-vbinary/gen/vaddc-wasm-x4.c b/src/f32-vbinary/gen/vaddc-wasm-x4.c
new file mode 100644
index 0000000..4a99e20
--- /dev/null
+++ b/src/f32-vbinary/gen/vaddc-wasm-x4.c

@@ -0,0 +1,69 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+
+void xnn_f32_vaddc_ukernel__wasm_x4(  // Adds the single value *b to every element of a, clamped to [params min, params max]; four floats per main-loop iteration.
+    size_t n,  // size of a and y in bytes; asserted to be a non-zero multiple of sizeof(float)
+    const float* a,
+    const float* b,  // only the first float is read
+    float* y,
+    const union xnn_f32_output_params params[restrict static 1])
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  const float vb = *b;  // constant addend, loaded once
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 + vb;
+    float vy1 = va1 + vb;
+    float vy2 = va2 + vb;
+    float vy3 = va3 + vb;
+
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);  // lower clamp
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);  // upper clamp
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // one to three elements remain; handle them one at a time
+    do {
+      const float va = *a++;
+      float vy = va + vb;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}

diff --git a/src/f32-vbinary/gen/vmul-wasm-x1.c b/src/f32-vbinary/gen/vmul-wasm-x1.c
new file mode 100644
index 0000000..f837a8a
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-wasm-x1.c

@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(a[i] * b[i], scalar.min), scalar.max); one float per iteration.
+void xnn_f32_vmul_ukernel__wasm_x1(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // first input vector
+    const float* b,  // second input vector, read element-wise
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+  // Multiply, then clamp from below and from above with the WAsm max/min builtins.
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va * vb;
+    vy = __builtin_wasm_max_f32(vy, vy_min);
+    vy = __builtin_wasm_min_f32(vy, vy_max);
+    *y++ = vy;
+  }
+}

diff --git a/src/f32-vbinary/gen/vmul-wasm-x2.c b/src/f32-vbinary/gen/vmul-wasm-x2.c
new file mode 100644
index 0000000..7b5b038
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-wasm-x2.c

@@ -0,0 +1,60 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(a[i] * b[i], scalar.min), scalar.max); main loop is unrolled 2x.
+void xnn_f32_vmul_ukernel__wasm_x2(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // first input vector
+    const float* b,  // second input vector, read element-wise
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+    // Clamp from below, then from above, using the WAsm max/min builtins.
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // fewer than 2 floats left; a single element is processed
+    const float va = *a;
+    const float vb = *b;
+    float vy = va * vb;
+    vy = __builtin_wasm_max_f32(vy, vy_min);
+    vy = __builtin_wasm_min_f32(vy, vy_max);
+    *y = vy;
+  }
+}

diff --git a/src/f32-vbinary/gen/vmul-wasm-x4.c b/src/f32-vbinary/gen/vmul-wasm-x4.c
new file mode 100644
index 0000000..518dcfc
--- /dev/null
+++ b/src/f32-vbinary/gen/vmul-wasm-x4.c

@@ -0,0 +1,75 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(a[i] * b[i], scalar.min), scalar.max); main loop is unrolled 4x.
+void xnn_f32_vmul_ukernel__wasm_x4(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // first input vector
+    const float* b,  // second input vector, read element-wise
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 * vb0;
+    float vy1 = va1 * vb1;
+    float vy2 = va2 * vb2;
+    float vy3 = va3 * vb3;
+    // Clamp from below, then from above, using the WAsm max/min builtins.
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // leftover floats (fewer than 4), handled one at a time
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va * vb;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}

diff --git a/src/f32-vbinary/gen/vmulc-wasm-x1.c b/src/f32-vbinary/gen/vmulc-wasm-x1.c
new file mode 100644
index 0000000..dfbf606
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-wasm-x1.c

@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(a[i] * *b, scalar.min), scalar.max); one float per iteration.
+void xnn_f32_vmulc_ukernel__wasm_x1(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input vector
+    const float* b,  // only *b is read: one scalar applied to every element of a
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+  // The second operand is a single value, loaded once before the loop.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    float vy = va * vb;
+    vy = __builtin_wasm_max_f32(vy, vy_min);
+    vy = __builtin_wasm_min_f32(vy, vy_max);
+    *y++ = vy;
+  }
+}

diff --git a/src/f32-vbinary/gen/vmulc-wasm-x2.c b/src/f32-vbinary/gen/vmulc-wasm-x2.c
new file mode 100644
index 0000000..ce8570d
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-wasm-x2.c

@@ -0,0 +1,56 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(a[i] * *b, scalar.min), scalar.max); main loop is unrolled 2x.
+void xnn_f32_vmulc_ukernel__wasm_x2(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input vector
+    const float* b,  // only *b is read: one scalar applied to every element of a
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+  // The second operand is a single value, loaded once before the loops.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 * vb;
+    float vy1 = va1 * vb;
+    // Clamp from below, then from above, using the WAsm max/min builtins.
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // fewer than 2 floats left; a single element is processed
+    const float va = *a;
+    float vy = va * vb;
+    vy = __builtin_wasm_max_f32(vy, vy_min);
+    vy = __builtin_wasm_min_f32(vy, vy_max);
+    *y = vy;
+  }
+}

diff --git a/src/f32-vbinary/gen/vmulc-wasm-x4.c b/src/f32-vbinary/gen/vmulc-wasm-x4.c
new file mode 100644
index 0000000..820838b
--- /dev/null
+++ b/src/f32-vbinary/gen/vmulc-wasm-x4.c

@@ -0,0 +1,69 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(a[i] * *b, scalar.min), scalar.max); main loop is unrolled 4x.
+void xnn_f32_vmulc_ukernel__wasm_x4(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input vector
+    const float* b,  // only *b is read: one scalar applied to every element of a
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+  // The second operand is a single value, loaded once before the loops.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 * vb;
+    float vy1 = va1 * vb;
+    float vy2 = va2 * vb;
+    float vy3 = va3 * vb;
+    // Clamp from below, then from above, using the WAsm max/min builtins.
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // leftover floats (fewer than 4), handled one at a time
+    do {
+      const float va = *a++;
+      float vy = va * vb;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}

diff --git a/src/f32-vbinary/gen/vrsubc-wasm-x1.c b/src/f32-vbinary/gen/vrsubc-wasm-x1.c
new file mode 100644
index 0000000..fc6ba71
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-wasm-x1.c

@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(*b - a[i], scalar.min), scalar.max); one float per iteration.
+void xnn_f32_vrsubc_ukernel__wasm_x1(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input vector (the subtrahend)
+    const float* b,  // only *b is read: the scalar every a[i] is subtracted from
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+  // The scalar operand is loaded once before the loop.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    float vy = vb - va;
+    vy = __builtin_wasm_max_f32(vy, vy_min);
+    vy = __builtin_wasm_min_f32(vy, vy_max);
+    *y++ = vy;
+  }
+}

diff --git a/src/f32-vbinary/gen/vrsubc-wasm-x2.c b/src/f32-vbinary/gen/vrsubc-wasm-x2.c
new file mode 100644
index 0000000..fec9c71
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-wasm-x2.c

@@ -0,0 +1,56 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(*b - a[i], scalar.min), scalar.max); main loop is unrolled 2x.
+void xnn_f32_vrsubc_ukernel__wasm_x2(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input vector (the subtrahend)
+    const float* b,  // only *b is read: the scalar every a[i] is subtracted from
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+  // The scalar operand is loaded once before the loops.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = vb - va0;
+    float vy1 = vb - va1;
+    // Clamp from below, then from above, using the WAsm max/min builtins.
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // fewer than 2 floats left; a single element is processed
+    const float va = *a;
+    float vy = vb - va;
+    vy = __builtin_wasm_max_f32(vy, vy_min);
+    vy = __builtin_wasm_min_f32(vy, vy_max);
+    *y = vy;
+  }
+}

diff --git a/src/f32-vbinary/gen/vrsubc-wasm-x4.c b/src/f32-vbinary/gen/vrsubc-wasm-x4.c
new file mode 100644
index 0000000..47aa49b
--- /dev/null
+++ b/src/f32-vbinary/gen/vrsubc-wasm-x4.c

@@ -0,0 +1,69 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(*b - a[i], scalar.min), scalar.max); main loop is unrolled 4x.
+void xnn_f32_vrsubc_ukernel__wasm_x4(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input vector (the subtrahend)
+    const float* b,  // only *b is read: the scalar every a[i] is subtracted from
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+  // The scalar operand is loaded once before the loops.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = vb - va0;
+    float vy1 = vb - va1;
+    float vy2 = vb - va2;
+    float vy3 = vb - va3;
+    // Clamp from below, then from above, using the WAsm max/min builtins.
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // leftover floats (fewer than 4), handled one at a time
+    do {
+      const float va = *a++;
+      float vy = vb - va;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}

diff --git a/src/f32-vbinary/gen/vsub-wasm-x1.c b/src/f32-vbinary/gen/vsub-wasm-x1.c
new file mode 100644
index 0000000..afd215f
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-wasm-x1.c

@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(a[i] - b[i], scalar.min), scalar.max); one float per iteration.
+void xnn_f32_vsub_ukernel__wasm_x1(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // minuend vector
+    const float* b,  // subtrahend vector, read element-wise
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+  // Subtract, then clamp from below and from above with the WAsm max/min builtins.
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    const float vb = *b++;
+    float vy = va - vb;
+    vy = __builtin_wasm_max_f32(vy, vy_min);
+    vy = __builtin_wasm_min_f32(vy, vy_max);
+    *y++ = vy;
+  }
+}

diff --git a/src/f32-vbinary/gen/vsub-wasm-x2.c b/src/f32-vbinary/gen/vsub-wasm-x2.c
new file mode 100644
index 0000000..7b5da3d
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-wasm-x2.c

@@ -0,0 +1,60 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(a[i] - b[i], scalar.min), scalar.max); main loop is unrolled 2x.
+void xnn_f32_vsub_ukernel__wasm_x2(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // minuend vector
+    const float* b,  // subtrahend vector, read element-wise
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    b += 2;
+
+    float vy0 = va0 - vb0;
+    float vy1 = va1 - vb1;
+    // Clamp from below, then from above, using the WAsm max/min builtins.
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // fewer than 2 floats left; a single element is processed
+    const float va = *a;
+    const float vb = *b;
+    float vy = va - vb;
+    vy = __builtin_wasm_max_f32(vy, vy_min);
+    vy = __builtin_wasm_min_f32(vy, vy_max);
+    *y = vy;
+  }
+}

diff --git a/src/f32-vbinary/gen/vsub-wasm-x4.c b/src/f32-vbinary/gen/vsub-wasm-x4.c
new file mode 100644
index 0000000..64a4e14
--- /dev/null
+++ b/src/f32-vbinary/gen/vsub-wasm-x4.c

@@ -0,0 +1,75 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vop-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(a[i] - b[i], scalar.min), scalar.max); main loop is unrolled 4x.
+void xnn_f32_vsub_ukernel__wasm_x4(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // minuend vector
+    const float* b,  // subtrahend vector, read element-wise
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    const float vb0 = b[0];
+    const float vb1 = b[1];
+    const float vb2 = b[2];
+    const float vb3 = b[3];
+    b += 4;
+
+    float vy0 = va0 - vb0;
+    float vy1 = va1 - vb1;
+    float vy2 = va2 - vb2;
+    float vy3 = va3 - vb3;
+    // Clamp from below, then from above, using the WAsm max/min builtins.
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // leftover floats (fewer than 4), handled one at a time
+    do {
+      const float va = *a++;
+      const float vb = *b++;
+      float vy = va - vb;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}

diff --git a/src/f32-vbinary/gen/vsubc-wasm-x1.c b/src/f32-vbinary/gen/vsubc-wasm-x1.c
new file mode 100644
index 0000000..27ee045
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-wasm-x1.c

@@ -0,0 +1,38 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(a[i] - *b, scalar.min), scalar.max); one float per iteration.
+void xnn_f32_vsubc_ukernel__wasm_x1(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input vector (the minuend)
+    const float* b,  // only *b is read: one scalar subtracted from every element of a
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+  // The scalar operand is loaded once before the loop.
+  const float vb = *b;
+  for (; n >= sizeof(float); n -= sizeof(float)) {
+    const float va = *a++;
+    float vy = va - vb;
+    vy = __builtin_wasm_max_f32(vy, vy_min);
+    vy = __builtin_wasm_min_f32(vy, vy_max);
+    *y++ = vy;
+  }
+}

diff --git a/src/f32-vbinary/gen/vsubc-wasm-x2.c b/src/f32-vbinary/gen/vsubc-wasm-x2.c
new file mode 100644
index 0000000..9ffd026
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-wasm-x2.c

@@ -0,0 +1,56 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(a[i] - *b, scalar.min), scalar.max); main loop is unrolled 2x.
+void xnn_f32_vsubc_ukernel__wasm_x2(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input vector (the minuend)
+    const float* b,  // only *b is read: one scalar subtracted from every element of a
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+  // The scalar operand is loaded once before the loops.
+  const float vb = *b;
+  for (; n >= 2 * sizeof(float); n -= 2 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    a += 2;
+
+    float vy0 = va0 - vb;
+    float vy1 = va1 - vb;
+    // Clamp from below, then from above, using the WAsm max/min builtins.
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {  // fewer than 2 floats left; a single element is processed
+    const float va = *a;
+    float vy = va - vb;
+    vy = __builtin_wasm_max_f32(vy, vy_min);
+    vy = __builtin_wasm_min_f32(vy, vy_max);
+    *y = vy;
+  }
+}

diff --git a/src/f32-vbinary/gen/vsubc-wasm-x4.c b/src/f32-vbinary/gen/vsubc-wasm-x4.c
new file mode 100644
index 0000000..1a02d0a
--- /dev/null
+++ b/src/f32-vbinary/gen/vsubc-wasm-x4.c

@@ -0,0 +1,69 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vbinary/vopc-scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/vbinary.h>
+
+// Element-wise y[i] = min(max(a[i] - *b, scalar.min), scalar.max); main loop is unrolled 4x.
+void xnn_f32_vsubc_ukernel__wasm_x4(
+    size_t n,  // bytes to process; asserted non-zero and a multiple of sizeof(float)
+    const float* a,  // input vector (the minuend)
+    const float* b,  // only *b is read: one scalar subtracted from every element of a
+    float* y,  // output vector
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(n != 0);
+  assert(n % sizeof(float) == 0);
+
+  const float vy_min = params->scalar.min;
+  const float vy_max = params->scalar.max;
+  // The scalar operand is loaded once before the loops.
+  const float vb = *b;
+  for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
+    const float va0 = a[0];
+    const float va1 = a[1];
+    const float va2 = a[2];
+    const float va3 = a[3];
+    a += 4;
+
+    float vy0 = va0 - vb;
+    float vy1 = va1 - vb;
+    float vy2 = va2 - vb;
+    float vy3 = va3 - vb;
+    // Clamp from below, then from above, using the WAsm max/min builtins.
+    vy0 = __builtin_wasm_max_f32(vy0, vy_min);
+    vy1 = __builtin_wasm_max_f32(vy1, vy_min);
+    vy2 = __builtin_wasm_max_f32(vy2, vy_min);
+    vy3 = __builtin_wasm_max_f32(vy3, vy_min);
+
+    vy0 = __builtin_wasm_min_f32(vy0, vy_max);
+    vy1 = __builtin_wasm_min_f32(vy1, vy_max);
+    vy2 = __builtin_wasm_min_f32(vy2, vy_max);
+    vy3 = __builtin_wasm_min_f32(vy3, vy_max);
+
+    y[0] = vy0;
+    y[1] = vy1;
+    y[2] = vy2;
+    y[3] = vy3;
+    y += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {  // leftover floats (fewer than 4), handled one at a time
+    do {
+      const float va = *a++;
+      float vy = va - vb;
+      vy = __builtin_wasm_max_f32(vy, vy_min);
+      vy = __builtin_wasm_min_f32(vy, vy_max);
+      *y++ = vy;
+      n -= sizeof(float);
+    } while (n != 0);
+  }
+}

diff --git a/src/f32-vbinary/vop-scalar.c.in b/src/f32-vbinary/vop-scalar.c.in
index 633218a..5ee1eae 100644
--- a/src/f32-vbinary/vop-scalar.c.in
+++ b/src/f32-vbinary/vop-scalar.c.in

@@ -13,12 +13,14 @@
 #include <xnnpack/vbinary.h>
 
 
+$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32"
+$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32"
 $OP_FUNC = {
 $  "ADD": lambda x, y: "%s + %s" % (x, y),
 $  "MUL": lambda x, y: "%s * %s" % (x, y),
 $  "SUB": lambda x, y: "%s - %s" % (x, y),
 $}[OP]
-void xnn_f32_v${OP.lower()}_ukernel__scalar_x${BATCH_TILE}(
+void xnn_f32_v${OP.lower()}_ukernel__${"wasm" if WASM else "scalar"}_x${BATCH_TILE}(
     size_t n,
     const float* a,
     const float* b,
@@ -45,10 +47,10 @@
         float vy${ABC[N]} = ${OP_FUNC("va" + ABC[N], "vb" + ABC[N])};
 
       $for N in range(BATCH_TILE):
-        vy${ABC[N]} = math_max_f32(vy${ABC[N]}, vy_min);
+        vy${ABC[N]} = ${MAX_F32}(vy${ABC[N]}, vy_min);
 
       $for N in range(BATCH_TILE):
-        vy${ABC[N]} = math_min_f32(vy${ABC[N]}, vy_max);
+        vy${ABC[N]} = ${MIN_F32}(vy${ABC[N]}, vy_max);
 
       $for N in range(BATCH_TILE):
         y[${N}] = vy${ABC[N]};
@@ -60,8 +62,8 @@
           const float va = *a++;
           const float vb = *b++;
           float vy = ${OP_FUNC("va", "vb")};
-          vy = math_max_f32(vy, vy_min);
-          vy = math_min_f32(vy, vy_max);
+          vy = ${MAX_F32}(vy, vy_min);
+          vy = ${MIN_F32}(vy, vy_max);
           *y++ = vy;
           n -= sizeof(float);
         } while (n != 0);
@@ -69,8 +71,8 @@
         const float va = *a;
         const float vb = *b;
         float vy = ${OP_FUNC("va", "vb")};
-        vy = math_max_f32(vy, vy_min);
-        vy = math_min_f32(vy, vy_max);
+        vy = ${MAX_F32}(vy, vy_min);
+        vy = ${MIN_F32}(vy, vy_max);
         *y = vy;
     }
   $else:
@@ -78,8 +80,8 @@
       const float va = *a++;
       const float vb = *b++;
       float vy = ${OP_FUNC("va", "vb")};
-      vy = math_max_f32(vy, vy_min);
-      vy = math_min_f32(vy, vy_max);
+      vy = ${MAX_F32}(vy, vy_min);
+      vy = ${MIN_F32}(vy, vy_max);
       *y++ = vy;
     }
 }

diff --git a/src/f32-vbinary/vopc-scalar.c.in b/src/f32-vbinary/vopc-scalar.c.in
index 88fb010..15d8e36 100644
--- a/src/f32-vbinary/vopc-scalar.c.in
+++ b/src/f32-vbinary/vopc-scalar.c.in

@@ -13,13 +13,15 @@
 #include <xnnpack/vbinary.h>
 
 
+$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32"
+$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32"
 $OP_FUNC = {
 $  "ADD": lambda x: "%s + vb" % x,
 $  "MUL": lambda x: "%s * vb" % x,
 $  "SUB": lambda x: "%s - vb" % x,
 $  "RSUB": lambda x: "vb - %s" % x,
 $}[OP]
-void xnn_f32_v${OP.lower()}c_ukernel__scalar_x${BATCH_TILE}(
+void xnn_f32_v${OP.lower()}c_ukernel__${"wasm" if WASM else "scalar"}_x${BATCH_TILE}(
     size_t n,
     const float* a,
     const float* b,
@@ -43,10 +45,10 @@
         float vy${ABC[N]} = ${OP_FUNC("va" + ABC[N])};
 
       $for N in range(BATCH_TILE):
-        vy${ABC[N]} = math_max_f32(vy${ABC[N]}, vy_min);
+        vy${ABC[N]} = ${MAX_F32}(vy${ABC[N]}, vy_min);
 
       $for N in range(BATCH_TILE):
-        vy${ABC[N]} = math_min_f32(vy${ABC[N]}, vy_max);
+        vy${ABC[N]} = ${MIN_F32}(vy${ABC[N]}, vy_max);
 
       $for N in range(BATCH_TILE):
         y[${N}] = vy${ABC[N]};
@@ -57,24 +59,24 @@
         do {
           const float va = *a++;
           float vy = ${OP_FUNC("va")};
-          vy = math_max_f32(vy, vy_min);
-          vy = math_min_f32(vy, vy_max);
+          vy = ${MAX_F32}(vy, vy_min);
+          vy = ${MIN_F32}(vy, vy_max);
           *y++ = vy;
           n -= sizeof(float);
         } while (n != 0);
       $else:
         const float va = *a;
         float vy = ${OP_FUNC("va")};
-        vy = math_max_f32(vy, vy_min);
-        vy = math_min_f32(vy, vy_max);
+        vy = ${MAX_F32}(vy, vy_min);
+        vy = ${MIN_F32}(vy, vy_max);
         *y = vy;
     }
   $else:
     for (; n >= sizeof(float); n -= sizeof(float)) {
       const float va = *a++;
       float vy = ${OP_FUNC("va")};
-      vy = math_max_f32(vy, vy_min);
-      vy = math_min_f32(vy, vy_max);
+      vy = ${MAX_F32}(vy, vy_min);
+      vy = ${MIN_F32}(vy, vy_max);
       *y++ = vy;
     }
 }

diff --git a/src/f32-vmulcaddc/gen/c1-wasm-2x.c b/src/f32-vmulcaddc/gen/c1-wasm-2x.c
new file mode 100644
index 0000000..16e131a
--- /dev/null
+++ b/src/f32-vmulcaddc/gen/c1-wasm-2x.c

@@ -0,0 +1,80 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vmulcaddc/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/vmulcaddc.h>
+
+// Per channel c: output = min(max(input * w[2c] + w[2c+1], scalar.min), scalar.max); two rows per pass.
+void xnn_f32_vmulcaddc_ukernel_c1__wasm_2x(
+    size_t rows,  // number of rows; asserted non-zero
+    size_t channels,  // row length in bytes; asserted non-zero and a multiple of sizeof(float)
+    const float*restrict input,
+    size_t input_stride,  // byte distance between consecutive input rows
+    const float*restrict weights,  // interleaved (scale, bias) pairs, one pair per channel
+    float*restrict output,
+    size_t output_stride,  // byte distance between consecutive output rows
+    const union xnn_f32_output_params params[restrict static 1])  // supplies scalar.min / scalar.max
+{
+  assert(rows != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+  // The channel loop advances each row pointer by `channels` bytes; these increments complete a 2-row step.
+  const size_t input_increment = input_stride * 2 - channels;
+  const size_t output_increment = output_stride * 2 - channels;
+
+  const float* i0 = input;
+  float* o0 = output;
+  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
+  float* o1 = (float*) ((uintptr_t) o0 + output_stride);
+  if XNN_UNPREDICTABLE(rows < 2) {  // single row: the row-1 pointers alias row 0
+    i1 = i0;
+    o1 = o0;
+  }
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {
+    const float* w = weights;
+    size_t c = channels;
+    do {
+      const float vscale = w[0];
+
+      float vacc0 = *i0++;
+      float vacc1 = *i1++;
+
+      const float vbias = w[1];
+
+      vacc0 = vacc0 * vscale + vbias;
+      vacc1 = vacc1 * vscale + vbias;
+      // Clamp from below, then from above, using the WAsm max/min builtins.
+      vacc0 = __builtin_wasm_max_f32(vacc0, vmin);
+      vacc1 = __builtin_wasm_max_f32(vacc1, vmin);
+
+      vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+      vacc1 = __builtin_wasm_min_f32(vacc1, vmax);
+
+      *o0++ = vacc0;
+      *o1++ = vacc1;
+
+      w += 2;
+      c -= sizeof(float);
+    } while (c != 0);
+    i0 = (const float*) ((uintptr_t) i0 + input_increment);
+    o0 = (float*) ((uintptr_t) o0 + output_increment);
+    i1 = (const float*) ((uintptr_t) i1 + input_increment);
+    o1 = (float*) ((uintptr_t) o1 + output_increment);
+    if XNN_UNPREDICTABLE(rows < 4) {  // rows is still the pre-decrement count: fewer than 2 rows remain
+      i1 = i0;
+      o1 = o0;
+    }
+    rows = doz(rows, 2);  // NOTE(review): doz() is not defined here; presumably a subtract that stops at 0 - confirm
+  } while (rows != 0);
+}

diff --git a/src/f32-vmulcaddc/gen/c2-wasm-2x.c b/src/f32-vmulcaddc/gen/c2-wasm-2x.c
new file mode 100644
index 0000000..b8eab2f
--- /dev/null
+++ b/src/f32-vmulcaddc/gen/c2-wasm-2x.c

@@ -0,0 +1,119 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vmulcaddc/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/vmulcaddc.h>
+
+
+void xnn_f32_vmulcaddc_ukernel_c2__wasm_2x(  // out = min(max(in * scale + bias, min), max); two rows per pass, two channels per main-loop step
+    size_t rows,  // number of rows to process; asserted non-zero
+    size_t channels,  // row length in bytes; asserted to be a non-zero multiple of sizeof(float)
+    const float*restrict input,  // first input row
+    size_t input_stride,  // byte distance between consecutive input rows
+    const float*restrict weights,  // per 2-channel tile: 2 scales followed by 2 biases; re-read from the start for every row pair
+    float*restrict output,  // first output row
+    size_t output_stride,  // byte distance between consecutive output rows
+    const union xnn_f32_output_params params[restrict static 1])  // provides scalar.min / scalar.max clamp bounds
+{
+  assert(rows != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+
+  const size_t input_increment = input_stride * 2 - channels;  // bytes from the end of a row to the start of the row two below it
+  const size_t output_increment = output_stride * 2 - channels;  // same adjustment for the output rows
+
+  const float* i0 = input;
+  float* o0 = output;
+  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);  // second row of the pair; strides are applied as byte offsets
+  float* o1 = (float*) ((uintptr_t) o0 + output_stride);
+  if XNN_UNPREDICTABLE(rows < 2) {  // only one row: alias row 1 onto row 0 so the paired code below stays in bounds
+    i1 = i0;
+    o1 = o0;
+  }
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {  // one iteration per pair of rows
+    const float* w = weights;  // restart at the first tile's weights for each row pair
+    size_t c = channels;  // remaining bytes in the current row
+    for (; c >= 2 * sizeof(float); c -= 2 * sizeof(float)) {  // full tiles of 2 channels
+      const float vscale0 = w[0];
+      const float vscale1 = w[1];
+
+      float vacc0x0 = i0[0];
+      float vacc0x1 = i0[1];
+      i0 += 2;
+      float vacc1x0 = i1[0];
+      float vacc1x1 = i1[1];
+      i1 += 2;
+
+      const float vbias0 = w[2];  // biases follow the tile's 2 scales
+      const float vbias1 = w[3];
+
+      vacc0x0 = vacc0x0 * vscale0 + vbias0;
+      vacc0x1 = vacc0x1 * vscale1 + vbias1;
+      vacc1x0 = vacc1x0 * vscale0 + vbias0;
+      vacc1x1 = vacc1x1 * vscale1 + vbias1;
+
+      vacc0x0 = __builtin_wasm_max_f32(vacc0x0, vmin);  // lower clamp via the WAsm f32.max builtin
+      vacc0x1 = __builtin_wasm_max_f32(vacc0x1, vmin);
+      vacc1x0 = __builtin_wasm_max_f32(vacc1x0, vmin);
+      vacc1x1 = __builtin_wasm_max_f32(vacc1x1, vmin);
+
+      vacc0x0 = __builtin_wasm_min_f32(vacc0x0, vmax);  // upper clamp via the WAsm f32.min builtin
+      vacc0x1 = __builtin_wasm_min_f32(vacc0x1, vmax);
+      vacc1x0 = __builtin_wasm_min_f32(vacc1x0, vmax);
+      vacc1x1 = __builtin_wasm_min_f32(vacc1x1, vmax);
+
+      o0[0] = vacc0x0;
+      o0[1] = vacc0x1;
+      o0 += 2;
+      o1[0] = vacc1x0;
+      o1[1] = vacc1x1;
+      o1 += 2;
+
+      w += 4;  // step past this tile's 2 scales and 2 biases
+    }
+    if XNN_UNLIKELY(c != 0) {  // leftover channels that do not fill a 2-channel tile
+      do {
+        const float vscale = *w++;  // scale read from the partial tile, then advance by one float
+
+        float vacc0 = *i0++;
+        float vacc1 = *i1++;
+
+        const float vbias = w[1];  // w was already advanced, so this is 2 floats past the scale: the matching bias slot of the tile
+
+        vacc0 = vacc0 * vscale + vbias;
+        vacc1 = vacc1 * vscale + vbias;
+
+        vacc0 = __builtin_wasm_max_f32(vacc0, vmin);
+        vacc1 = __builtin_wasm_max_f32(vacc1, vmin);
+
+        vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+        vacc1 = __builtin_wasm_min_f32(vacc1, vmax);
+
+        *o0++ = vacc0;
+        *o1++ = vacc1;
+
+        c -= sizeof(float);
+      } while (c != 0);
+    }
+    i0 = (const float*) ((uintptr_t) i0 + input_increment);  // pointers sit at the row end here; move each to the row two below
+    o0 = (float*) ((uintptr_t) o0 + output_increment);
+    i1 = (const float*) ((uintptr_t) i1 + input_increment);
+    o1 = (float*) ((uintptr_t) o1 + output_increment);
+    if XNN_UNPREDICTABLE(rows < 4) {  // rows still counts the pair just processed, so fewer than 2 rows remain: alias row 1 onto row 0
+      i1 = i0;
+      o1 = o0;
+    }
+    rows = doz(rows, 2);  // doz is not defined in this file; presumably a saturating subtract from xnnpack/math.h - confirm
+  } while (rows != 0);
+}

diff --git a/src/f32-vmulcaddc/gen/c4-wasm-2x.c b/src/f32-vmulcaddc/gen/c4-wasm-2x.c
new file mode 100644
index 0000000..1b45e19
--- /dev/null
+++ b/src/f32-vmulcaddc/gen/c4-wasm-2x.c

@@ -0,0 +1,143 @@
+// Auto-generated file. Do not edit!
+//   Template: src/f32-vmulcaddc/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/vmulcaddc.h>
+
+
+void xnn_f32_vmulcaddc_ukernel_c4__wasm_2x(  // out = min(max(in * scale + bias, min), max); two rows per pass, four channels per main-loop step
+    size_t rows,  // number of rows to process; asserted non-zero
+    size_t channels,  // row length in bytes; asserted to be a non-zero multiple of sizeof(float)
+    const float*restrict input,  // first input row
+    size_t input_stride,  // byte distance between consecutive input rows
+    const float*restrict weights,  // per 4-channel tile: 4 scales followed by 4 biases; re-read from the start for every row pair
+    float*restrict output,  // first output row
+    size_t output_stride,  // byte distance between consecutive output rows
+    const union xnn_f32_output_params params[restrict static 1])  // provides scalar.min / scalar.max clamp bounds
+{
+  assert(rows != 0);
+  assert(channels != 0);
+  assert(channels % sizeof(float) == 0);
+
+  const size_t input_increment = input_stride * 2 - channels;  // bytes from the end of a row to the start of the row two below it
+  const size_t output_increment = output_stride * 2 - channels;  // same adjustment for the output rows
+
+  const float* i0 = input;
+  float* o0 = output;
+  const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);  // second row of the pair; strides are applied as byte offsets
+  float* o1 = (float*) ((uintptr_t) o0 + output_stride);
+  if XNN_UNPREDICTABLE(rows < 2) {  // only one row: alias row 1 onto row 0 so the paired code below stays in bounds
+    i1 = i0;
+    o1 = o0;
+  }
+
+  const float vmin = params->scalar.min;
+  const float vmax = params->scalar.max;
+  do {  // one iteration per pair of rows
+    const float* w = weights;  // restart at the first tile's weights for each row pair
+    size_t c = channels;  // remaining bytes in the current row
+    for (; c >= 4 * sizeof(float); c -= 4 * sizeof(float)) {  // full tiles of 4 channels
+      const float vscale0 = w[0];
+      const float vscale1 = w[1];
+      const float vscale2 = w[2];
+      const float vscale3 = w[3];
+
+      float vacc0x0 = i0[0];
+      float vacc0x1 = i0[1];
+      float vacc0x2 = i0[2];
+      float vacc0x3 = i0[3];
+      i0 += 4;
+      float vacc1x0 = i1[0];
+      float vacc1x1 = i1[1];
+      float vacc1x2 = i1[2];
+      float vacc1x3 = i1[3];
+      i1 += 4;
+
+      const float vbias0 = w[4];  // biases follow the tile's 4 scales
+      const float vbias1 = w[5];
+      const float vbias2 = w[6];
+      const float vbias3 = w[7];
+
+      vacc0x0 = vacc0x0 * vscale0 + vbias0;
+      vacc0x1 = vacc0x1 * vscale1 + vbias1;
+      vacc0x2 = vacc0x2 * vscale2 + vbias2;
+      vacc0x3 = vacc0x3 * vscale3 + vbias3;
+      vacc1x0 = vacc1x0 * vscale0 + vbias0;
+      vacc1x1 = vacc1x1 * vscale1 + vbias1;
+      vacc1x2 = vacc1x2 * vscale2 + vbias2;
+      vacc1x3 = vacc1x3 * vscale3 + vbias3;
+
+      vacc0x0 = __builtin_wasm_max_f32(vacc0x0, vmin);  // lower clamp via the WAsm f32.max builtin
+      vacc0x1 = __builtin_wasm_max_f32(vacc0x1, vmin);
+      vacc0x2 = __builtin_wasm_max_f32(vacc0x2, vmin);
+      vacc0x3 = __builtin_wasm_max_f32(vacc0x3, vmin);
+      vacc1x0 = __builtin_wasm_max_f32(vacc1x0, vmin);
+      vacc1x1 = __builtin_wasm_max_f32(vacc1x1, vmin);
+      vacc1x2 = __builtin_wasm_max_f32(vacc1x2, vmin);
+      vacc1x3 = __builtin_wasm_max_f32(vacc1x3, vmin);
+
+      vacc0x0 = __builtin_wasm_min_f32(vacc0x0, vmax);  // upper clamp via the WAsm f32.min builtin
+      vacc0x1 = __builtin_wasm_min_f32(vacc0x1, vmax);
+      vacc0x2 = __builtin_wasm_min_f32(vacc0x2, vmax);
+      vacc0x3 = __builtin_wasm_min_f32(vacc0x3, vmax);
+      vacc1x0 = __builtin_wasm_min_f32(vacc1x0, vmax);
+      vacc1x1 = __builtin_wasm_min_f32(vacc1x1, vmax);
+      vacc1x2 = __builtin_wasm_min_f32(vacc1x2, vmax);
+      vacc1x3 = __builtin_wasm_min_f32(vacc1x3, vmax);
+
+      o0[0] = vacc0x0;
+      o0[1] = vacc0x1;
+      o0[2] = vacc0x2;
+      o0[3] = vacc0x3;
+      o0 += 4;
+      o1[0] = vacc1x0;
+      o1[1] = vacc1x1;
+      o1[2] = vacc1x2;
+      o1[3] = vacc1x3;
+      o1 += 4;
+
+      w += 8;  // step past this tile's 4 scales and 4 biases
+    }
+    if XNN_UNLIKELY(c != 0) {  // leftover channels that do not fill a 4-channel tile
+      do {
+        const float vscale = *w++;  // next scale of the partial tile, then advance by one float
+
+        float vacc0 = *i0++;
+        float vacc1 = *i1++;
+
+        const float vbias = w[3];  // w was already advanced, so this is 4 floats past the scale: the matching bias slot of the tile
+
+        vacc0 = vacc0 * vscale + vbias;
+        vacc1 = vacc1 * vscale + vbias;
+
+        vacc0 = __builtin_wasm_max_f32(vacc0, vmin);
+        vacc1 = __builtin_wasm_max_f32(vacc1, vmin);
+
+        vacc0 = __builtin_wasm_min_f32(vacc0, vmax);
+        vacc1 = __builtin_wasm_min_f32(vacc1, vmax);
+
+        *o0++ = vacc0;
+        *o1++ = vacc1;
+
+        c -= sizeof(float);
+      } while (c != 0);
+    }
+    i0 = (const float*) ((uintptr_t) i0 + input_increment);  // pointers sit at the row end here; move each to the row two below
+    o0 = (float*) ((uintptr_t) o0 + output_increment);
+    i1 = (const float*) ((uintptr_t) i1 + input_increment);
+    o1 = (float*) ((uintptr_t) o1 + output_increment);
+    if XNN_UNPREDICTABLE(rows < 4) {  // rows still counts the pair just processed, so fewer than 2 rows remain: alias row 1 onto row 0
+      i1 = i0;
+      o1 = o0;
+    }
+    rows = doz(rows, 2);  // doz is not defined in this file; presumably a saturating subtract from xnnpack/math.h - confirm
+  } while (rows != 0);
+}

diff --git a/src/f32-vmulcaddc/scalar.c.in b/src/f32-vmulcaddc/scalar.c.in
index f25f500..fccad73 100644
--- a/src/f32-vmulcaddc/scalar.c.in
+++ b/src/f32-vmulcaddc/scalar.c.in

@@ -12,7 +12,9 @@
 #include <xnnpack/vmulcaddc.h>
 
 
-void xnn_f32_vmulcaddc_ukernel_c${CHANNEL_TILE}__scalar_${ROW_TILE}x(
+$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32"
+$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32"
+void xnn_f32_vmulcaddc_ukernel_c${CHANNEL_TILE}__${"wasm" if WASM else "scalar"}_${ROW_TILE}x(
     size_t rows,
     size_t channels,
     const float*restrict input,
@@ -69,11 +71,11 @@
 
         $for M in range(ROW_TILE):
           $for C in range(CHANNEL_TILE):
-            vacc${M}x${ABC[C]} = math_max_f32(vacc${M}x${ABC[C]}, vmin);
+            vacc${M}x${ABC[C]} = ${MAX_F32}(vacc${M}x${ABC[C]}, vmin);
 
         $for M in range(ROW_TILE):
           $for C in range(CHANNEL_TILE):
-            vacc${M}x${ABC[C]} = math_min_f32(vacc${M}x${ABC[C]}, vmax);
+            vacc${M}x${ABC[C]} = ${MIN_F32}(vacc${M}x${ABC[C]}, vmax);
 
         $for M in range(ROW_TILE):
           $for C in range(CHANNEL_TILE):
@@ -95,10 +97,10 @@
             vacc${M} = vacc${M} * vscale + vbias;
 
           $for M in range(ROW_TILE):
-            vacc${M} = math_max_f32(vacc${M}, vmin);
+            vacc${M} = ${MAX_F32}(vacc${M}, vmin);
 
           $for M in range(ROW_TILE):
-            vacc${M} = math_min_f32(vacc${M}, vmax);
+            vacc${M} = ${MIN_F32}(vacc${M}, vmax);
 
           $for M in range(ROW_TILE):
             *o${M}++ = vacc${M};
@@ -119,10 +121,10 @@
           vacc${M} = vacc${M} * vscale + vbias;
 
         $for M in range(ROW_TILE):
-          vacc${M} = math_max_f32(vacc${M}, vmin);
+          vacc${M} = ${MAX_F32}(vacc${M}, vmin);
 
         $for M in range(ROW_TILE):
-          vacc${M} = math_min_f32(vacc${M}, vmax);
+          vacc${M} = ${MIN_F32}(vacc${M}, vmax);
 
         $for M in range(ROW_TILE):
           *o${M}++ = vacc${M};

diff --git a/src/init.c b/src/init.c
index 5c32056..e189b8b 100644
--- a/src/init.c
+++ b/src/init.c

@@ -1134,61 +1134,61 @@
       xnn_params.f32.gemm = (struct gemm_parameters) {
         .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_2x4__scalar,
         .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_2x4__scalar,
-        .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar,
-        .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar,
+        .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm,
+        .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm,
         .mr = 2,
         .nr = 4,
       };
     } else {
       xnn_params.f32.gemm = (struct gemm_parameters) {
-        .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__scalar,
-        .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__scalar,
-        .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__scalar,
-        .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__scalar,
+        .gemm = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_4x4__wasm,
+        .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x4__wasm,
+        .gemm1 = (xnn_gemm_ukernel_function) xnn_f32_gemm_ukernel_1x4__wasm,
+        .igemm1 = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_1x4__wasm,
         .mr = 4,
         .nr = 4,
       };
     }
     xnn_params.f32.gemm2 = (struct gemm_parameters) {
       .gemm = NULL,
-      .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__scalar,
+      .igemm = (xnn_igemm_ukernel_function) xnn_f32_igemm_ukernel_4x2__wasm,
       .mr = 4,
       .nr = 2,
     };
     xnn_params.f32.dwconv[0] = (struct dwconv_parameters) {
-      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__scalar_acc2,
+      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x4__wasm_acc2,
       .cr = 1,
       .mr = 4,
     };
     xnn_params.f32.dwconv[1] = (struct dwconv_parameters) {
-      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__scalar_acc2,
+      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x9__wasm_acc2,
       .cr = 1,
       .mr = 9,
     };
     xnn_params.f32.dwconv[2] = (struct dwconv_parameters) {
-      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__scalar_acc2,
+      .up = (xnn_dwconv_up_ukernel_function) xnn_f32_dwconv_ukernel_up1x25__wasm_acc2,
       .cr = 1,
       .mr = 25,
     };
     xnn_params.f32.avgpool = (struct avgpool_parameters) {
-      .up = (xnn_avgpool_up_ukernel_function) xnn_f32_avgpool_ukernel_up9__scalar,
-      .mp = (xnn_avgpool_mp_ukernel_function) xnn_f32_avgpool_ukernel_mp9p8q__scalar,
+      .up = (xnn_avgpool_up_ukernel_function) xnn_f32_avgpool_ukernel_up9__wasm,
+      .mp = (xnn_avgpool_mp_ukernel_function) xnn_f32_avgpool_ukernel_mp9p8q__wasm,
       .mr = 9,
       .qr = 8,
     };
     xnn_params.f32.pavgpool = (struct pavgpool_parameters) {
-      .up = (xnn_pavgpool_up_ukernel_function) xnn_f32_pavgpool_ukernel_up9__scalar,
-      .mp = (xnn_pavgpool_mp_ukernel_function) xnn_f32_pavgpool_ukernel_mp9p8q__scalar,
+      .up = (xnn_pavgpool_up_ukernel_function) xnn_f32_pavgpool_ukernel_up9__wasm,
+      .mp = (xnn_pavgpool_mp_ukernel_function) xnn_f32_pavgpool_ukernel_mp9p8q__wasm,
       .mr = 9,
       .qr = 8,
     };
     xnn_params.f32.gavgpool = (struct gavgpool_parameters) {
-      .up = (xnn_gavgpool_up_ukernel_function) xnn_f32_gavgpool_ukernel_up7__scalar,
-      .mp = (xnn_gavgpool_mp_ukernel_function) xnn_f32_gavgpool_ukernel_mp7p7q__scalar,
+      .up = (xnn_gavgpool_up_ukernel_function) xnn_f32_gavgpool_ukernel_up7__wasm,
+      .mp = (xnn_gavgpool_mp_ukernel_function) xnn_f32_gavgpool_ukernel_mp7p7q__wasm,
       .mr = 7,
     };
     xnn_params.f32.maxpool = (struct maxpool_parameters) {
-      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__scalar_c1,
+      .ukernel = (xnn_maxpool_ukernel_function) xnn_f32_maxpool_ukernel_9p8x__wasm_c1,
       .mr = 9,
       .qr = 8,
     };
@@ -1210,33 +1210,33 @@
       .pixel_tile = 1,
       .channel_tile = 2,
     };
-    xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__scalar;
-    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__scalar;
+    xnn_params.f32.clamp = (xnn_univector_ukernel_function) xnn_f32_clamp_ukernel__wasm;
+    xnn_params.f32.hswish = (xnn_univector_ukernel_function) xnn_f32_hswish_ukernel__wasm;
     xnn_params.f32.prelu = (struct prelu_parameters) {
-      .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__scalar_2x4,
+      .ukernel = (xnn_prelu_ukernel_function) xnn_f32_prelu_ukernel__wasm_2x4,
       .row_tile = 4,
       .channel_tile = 4,
     };
     xnn_params.f32.vadd = (struct vbinary_parameters) {
-      .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__scalar_x4,
-      .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__scalar_x4,
-      .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__scalar_x4,
+      .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vadd_ukernel__wasm_x4,
+      .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasm_x4,
+      .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vaddc_ukernel__wasm_x4,
       .element_tile = 8,
     };
     xnn_params.f32.vmul = (struct vbinary_parameters) {
-      .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__scalar_x4,
-      .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__scalar_x4,
-      .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__scalar_x4,
+      .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmul_ukernel__wasm_x4,
+      .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasm_x4,
+      .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vmulc_ukernel__wasm_x4,
       .element_tile = 8,
     };
     xnn_params.f32.vsub = (struct vbinary_parameters) {
-      .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__scalar_x4,
-      .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__scalar_x4,
-      .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__scalar_x4,
+      .op_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsub_ukernel__wasm_x4,
+      .opc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vsubc_ukernel__wasm_x4,
+      .ropc_ukernel = (xnn_vbinary_ukernel_function) xnn_f32_vrsubc_ukernel__wasm_x4,
       .element_tile = 8,
     };
     xnn_params.f32.vmulcaddc = (struct vmulcaddc_parameters) {
-      .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c1__scalar_2x,
+      .ukernel = (xnn_vmulcaddc_ukernel_function) xnn_f32_vmulcaddc_ukernel_c1__wasm_2x,
       .channel_tile = 1,
       .row_tile = 2,
     };

diff --git a/src/xnnpack/avgpool.h b/src/xnnpack/avgpool.h
index d838fb5..bea28ac 100644
--- a/src/xnnpack/avgpool.h
+++ b/src/xnnpack/avgpool.h

@@ -33,9 +33,10 @@
       const union xnn_f32_avgpool_params* params);
 
 DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__neon)
-DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__psimd)
-DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__scalar)
 DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__sse)
+DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__psimd)
+DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__wasm)
+DECLARE_F32_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_mp9p8q__scalar)
 
 
 #define DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
@@ -51,9 +52,10 @@
       const union xnn_f32_avgpool_params* params);
 
 DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__neon)
-DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__psimd)
-DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__scalar)
 DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__sse)
+DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__psimd)
+DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__wasm)
+DECLARE_F32_AVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_avgpool_ukernel_up9__scalar)
 
 
 #define DECLARE_Q8_AVGPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name)           \

diff --git a/src/xnnpack/clamp.h b/src/xnnpack/clamp.h
index 7a9c67b..ef7a4b6 100644
--- a/src/xnnpack/clamp.h
+++ b/src/xnnpack/clamp.h

@@ -31,6 +31,7 @@
 DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__avx)
 DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__avx512f)
 DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__psimd)
+DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__wasm)
 DECLARE_F32_CLAMP_UKERNEL_FUNCTION(xnn_f32_clamp_ukernel__scalar)
 
 

diff --git a/src/xnnpack/dwconv.h b/src/xnnpack/dwconv.h
index 45069b9..6c77827 100644
--- a/src/xnnpack/dwconv.h
+++ b/src/xnnpack/dwconv.h

@@ -29,10 +29,6 @@
     size_t output_increment,                                 \
     const union xnn_f32_output_params* params);
 
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x4__scalar)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x4__scalar_acc2)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x4__scalar)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x4__psimd)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x4__psimd_acc2)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x4__psimd)
@@ -53,11 +49,15 @@
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up16x4__avx512f_acc2)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up32x4__avx512f)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up32x4__avx512f_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x4__wasm)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x4__wasm_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x4__wasm)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x4__wasm_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x4__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x4__scalar_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x4__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x4__scalar_acc2)
 
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x9__scalar)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x9__scalar_acc2)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x9__scalar)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__neon)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x9__neon_acc2)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x9__neon)
@@ -88,11 +88,15 @@
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up16x9__avx512f_acc2)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up32x9__avx512f)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up32x9__avx512f_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x9__wasm)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x9__wasm_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x9__wasm)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x9__wasm_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x9__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x9__scalar_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x9__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x9__scalar_acc2)
 
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x25__scalar)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x25__scalar_acc2)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x25__scalar)
-DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x25__psimd)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up4x25__psimd_acc2)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up8x25__psimd)
@@ -113,6 +117,14 @@
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up16x25__avx512f_acc2)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up32x25__avx512f)
 DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up32x25__avx512f_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x25__wasm)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x25__wasm_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x25__wasm)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x25__wasm_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x25__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up1x25__scalar_acc2)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x25__scalar)
+DECLARE_F32_DWCONV_UNIPASS_UKERNEL_FUNCTION(xnn_f32_dwconv_ukernel_up2x25__scalar_acc2)
 
 
 #define DECLARE_Q8_DWCONV_UNIPASS_UKERNEL_FUNCTION(fn_name) \

diff --git a/src/xnnpack/gavgpool.h b/src/xnnpack/gavgpool.h
index ebfaa28..b8b5285 100644
--- a/src/xnnpack/gavgpool.h
+++ b/src/xnnpack/gavgpool.h

@@ -31,9 +31,10 @@
       const union xnn_f32_avgpool_params* params);
 
 DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__neon)
-DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__psimd)
-DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__scalar)
 DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__sse)
+DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__psimd)
+DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__wasm)
+DECLARE_F32_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_mp7p7q__scalar)
 
 
 #define DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
@@ -47,9 +48,10 @@
       const union xnn_f32_avgpool_params* params);
 
 DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__neon)
-DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__psimd)
-DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__scalar)
 DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__sse)
+DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__psimd)
+DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__wasm)
+DECLARE_F32_GAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_gavgpool_ukernel_up7__scalar)
 
 #define DECLARE_Q8_GAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(fn_name)          \
   XNN_INTERNAL void fn_name(                                             \

diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index 88a9d8b..2688f42 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h

@@ -34,7 +34,6 @@
 
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x4__scalar)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
@@ -52,13 +51,10 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8s4__neonfma)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8s4__psimd)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x8s4__sse)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_2x4__scalar)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__neonfma_lane_ld64)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__scalar)
-DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x4__scalar)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75)
@@ -119,6 +115,16 @@
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_8x8s4__neon)
 DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_8x8s4__neonfma)
 
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x4__wasm)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_2x4__wasm)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__wasm)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x4__wasm)
+
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_1x4__scalar)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_2x4__scalar)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x2__scalar)
+DECLARE_F32_GEMM_UKERNEL_FUNCTION(xnn_f32_gemm_ukernel_4x4__scalar)
+
 #define DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                          \
       size_t mr,                                      \
@@ -135,7 +141,6 @@
 
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x4__scalar)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75)
@@ -153,13 +158,10 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8s4__neonfma)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8s4__psimd)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x8s4__sse)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_2x4__scalar)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x2__neon_lane_ld64)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x2__neonfma_lane_ld64)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x2__scalar)
-DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x4__scalar)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75)
@@ -219,6 +221,16 @@
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_8x8s4__neon)
 DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_8x8s4__neonfma)
 
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x4__wasm)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_2x4__wasm)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x2__wasm)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x4__wasm)
+
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_1x4__scalar)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_2x4__scalar)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x2__scalar)
+DECLARE_F32_GEMMINC_UKERNEL_FUNCTION(xnn_f32_gemminc_ukernel_4x4__scalar)
+
 #define DECLARE_F16_GEMM_UKERNEL_FUNCTION(fn_name) \
   void fn_name(                                    \
       size_t mr,                                   \

diff --git a/src/xnnpack/hswish.h b/src/xnnpack/hswish.h
index eda1a1f..d80595c 100644
--- a/src/xnnpack/hswish.h
+++ b/src/xnnpack/hswish.h

@@ -23,10 +23,11 @@
       float* y,                                      \
       const union xnn_f32_hswish_params* params);
 
-DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__psimd)
 DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neon)
 DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__neonfma)
 DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__sse)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__psimd)
+DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__wasm)
 DECLARE_F32_HSWISH_UKERNEL_FUNCTION(xnn_f32_hswish_ukernel__scalar)
 
 

diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 96167cd..2c7b64d 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h

@@ -36,7 +36,6 @@
 
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x4__scalar)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75)
@@ -54,17 +53,14 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8s4__neonfma)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8s4__psimd)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x8s4__sse)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_2x4__scalar)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__scalar)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2c4__psimd)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2c4__sse)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64)
-DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__scalar)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x8__avx_broadcast)
@@ -117,6 +113,16 @@
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_8x8s4__neon)
 DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_8x8s4__neonfma)
 
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x4__wasm)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_2x4__wasm)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__wasm)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__wasm)
+
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_1x4__scalar)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_2x4__scalar)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x2__scalar)
+DECLARE_F32_IGEMM_UKERNEL_FUNCTION(xnn_f32_igemm_ukernel_4x4__scalar)
+
 #define DECLARE_Q8_IGEMM_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                       \
       size_t mr,                                   \

diff --git a/src/xnnpack/math.h b/src/xnnpack/math.h
index 69eda29..a1e1253 100644
--- a/src/xnnpack/math.h
+++ b/src/xnnpack/math.h

@@ -60,17 +60,9 @@
 }
 
 inline static float math_min_f32(float a, float b) {
-  #if defined(__wasm__)
-    return __builtin_wasm_min_f32(a, b);
-  #else
-    return XNN_UNPREDICTABLE(b < a) ? b : a;
-  #endif
+  return XNN_UNPREDICTABLE(b < a) ? b : a;
 }
 
 inline static float math_max_f32(float a, float b) {
-  #if defined(__wasm__)
-    return __builtin_wasm_max_f32(a, b);
-  #else
-    return XNN_UNPREDICTABLE(b < a) ? a : b;
-  #endif
+  return XNN_UNPREDICTABLE(b < a) ? a : b;
 }

diff --git a/src/xnnpack/maxpool.h b/src/xnnpack/maxpool.h
index 6013cba..697d4eb 100644
--- a/src/xnnpack/maxpool.h
+++ b/src/xnnpack/maxpool.h

@@ -31,8 +31,9 @@
       size_t output_increment,                        \
       const union xnn_f32_output_params* params);
 
-DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8x__psimd_c4)
 DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8x__sse_c4)
+DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8x__psimd_c4)
+DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8x__wasm_c1)
 DECLARE_F32_MAXPOOL_UKERNEL_FUNCTION(xnn_f32_maxpool_ukernel_9p8x__scalar_c1)
 
 

diff --git a/src/xnnpack/pavgpool.h b/src/xnnpack/pavgpool.h
index 3be16d3..5caa61c 100644
--- a/src/xnnpack/pavgpool.h
+++ b/src/xnnpack/pavgpool.h

@@ -31,9 +31,10 @@
       const union xnn_f32_output_params* params);
 
 DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__neon)
-DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__psimd)
-DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__scalar)
 DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__sse)
+DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__psimd)
+DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__wasm)
+DECLARE_F32_PAVGPOOL_MULTIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_mp9p8q__scalar)
 
 
 #define DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(fn_name) \
@@ -50,9 +51,10 @@
       const union xnn_f32_output_params* params);
 
 DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__neon)
-DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__psimd)
-DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__scalar)
 DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__sse)
+DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__psimd)
+DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__wasm)
+DECLARE_F32_PAVGPOOL_UNIPASS_UKERNEL_FUNCTION(xnn_f32_pavgpool_ukernel_up9__scalar)
 
 
 #ifdef __cplusplus

diff --git a/src/xnnpack/prelu.h b/src/xnnpack/prelu.h
index a0fefab..8606b25 100644
--- a/src/xnnpack/prelu.h
+++ b/src/xnnpack/prelu.h

@@ -27,17 +27,24 @@
       size_t output_stride,                                    \
       const union xnn_f32_output_params* params);
 
-DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__scalar_2x1)
-DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__scalar_2x4)
 DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_2x4)
 DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__neon_2x8)
-DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__psimd_2x4)
-DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__psimd_2x8)
+
 DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse2_2x4)
 DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse2_2x8)
+
 DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse41_2x4)
 DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__sse41_2x8)
 
+DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__psimd_2x4)
+DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__psimd_2x8)
+
+DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasm_2x1)
+DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__wasm_2x4)
+
+DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__scalar_2x1)
+DECLARE_F32_PRELU_UKERNEL_FUNCTION(xnn_f32_prelu_ukernel__scalar_2x4)
+
 
 #ifdef __cplusplus
 }  // extern "C"

diff --git a/src/xnnpack/vbinary.h b/src/xnnpack/vbinary.h
index 1a7ada3..0a405a3 100644
--- a/src/xnnpack/vbinary.h
+++ b/src/xnnpack/vbinary.h

@@ -29,73 +29,94 @@
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__neon_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__neon_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__sse_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__sse_x8)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__psimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__psimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__wasm_x1)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__wasm_x2)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__wasm_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__scalar_x4)
-DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__sse_x4)
-DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vadd_ukernel__sse_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__neon_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__neon_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__sse_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__sse_x8)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__psimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__psimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__wasm_x1)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__wasm_x2)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__wasm_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__scalar_x4)
-DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__sse_x4)
-DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmul_ukernel__sse_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__neon_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__neon_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__sse_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__sse_x8)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__psimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__psimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__wasm_x1)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__wasm_x2)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__wasm_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__scalar_x4)
-DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__sse_x4)
-DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsub_ukernel__sse_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__neon_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__neon_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__sse_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__sse_x8)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__psimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__psimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__wasm_x1)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__wasm_x2)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__wasm_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__scalar_x4)
-DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__sse_x4)
-DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vaddc_ukernel__sse_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__neon_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__neon_x8)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__psimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__psimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__sse_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__sse_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__wasm_x1)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__wasm_x2)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__wasm_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__scalar_x4)
-DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__sse_x4)
-DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vmulc_ukernel__sse_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__neon_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__neon_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__sse_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__sse_x8)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__psimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__psimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__wasm_x1)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__wasm_x2)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__wasm_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__scalar_x4)
-DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__sse_x4)
-DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vsubc_ukernel__sse_x8)
 
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__neon_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__neon_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__sse_x4)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__sse_x8)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__psimd_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__psimd_x8)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__wasm_x1)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__wasm_x2)
+DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__wasm_x4)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__scalar_x1)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__scalar_x2)
 DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__scalar_x4)
-DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__sse_x4)
-DECLARE_F32_VBINOP_UKERNEL_FUNCTION(xnn_f32_vrsubc_ukernel__sse_x8)
 
 
 #ifdef __cplusplus

diff --git a/src/xnnpack/vmulcaddc.h b/src/xnnpack/vmulcaddc.h
index d659687..4e817e4 100644
--- a/src/xnnpack/vmulcaddc.h
+++ b/src/xnnpack/vmulcaddc.h

@@ -27,17 +27,25 @@
       size_t y_stride,                                  \
       const union xnn_f32_output_params* params);
 
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__neon_2x)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c8__neon_2x)
+
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__neonfma_2x)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c8__neonfma_2x)
+
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__psimd_2x)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c8__psimd_2x)
+
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__sse_2x)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c8__sse_2x)
+
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c1__wasm_2x)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c2__wasm_2x)
+DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__wasm_2x)
+
 DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c1__scalar_2x)
 DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c2__scalar_2x)
 DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__scalar_2x)
-DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__neon_2x)
-DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__neonfma_2x)
-DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__psimd_2x)
-DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c4__sse_2x)
-DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c8__neon_2x)
-DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c8__neonfma_2x)
-DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c8__psimd_2x)
-DECLARE_F32_VMULCADDC_UKERNEL_FUNCTION(xnn_f32_vmulcaddc_ukernel_c8__sse_2x)
 
 
 #ifdef __cplusplus
commit	436ebe6cc2a7a6cc746ac4bcb8cf95f665ae6c29	[log] [tgz]
author	Marat Dukhan <maratek@google.com>	Wed Dec 04 15:10:12 2019 -0800
committer	XNNPACK Team <xnnpack-github-robot@google.com>	Wed Dec 04 15:10:53 2019 -0800
tree	367cdc7cbcbcaa139c4a55fc72e2c4b26b7bdfdb
parent	05f3f6dc940ea45796c009bd09779f597a99151d [diff]