Generalize QS8 VADD[C] templates to cover QU8 VADD[C] microkernels

- QU8 VADD[C] SSE2 MUL16 microkernels
- QU8 VADD[C] NEON microkernels
- QU8 VADD[C] WAsm SIMD microkernels
- QU8 VADD[C] scalar microkernels
- Unit tests

PiperOrigin-RevId: 385931044
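
Note on the shared requantization scheme: every new QU8 kernel in this change computes, per element, an int32 accumulator bias + a*a_multiplier + b*b_multiplier (the NEON variant instead subtracts the zero points explicitly before multiplying), applies a rounding arithmetic shift right, clamps against the output bounds, and finally adds the output zero point. Below is a minimal single-element reference in plain C mirroring the generated scalar microkernel; the relations bias = -(a_zero_point*a_multiplier + b_zero_point*b_multiplier) and rounding = 1 << (shift - 1) are assumptions about the params initialization, which is not part of this diff:

  #include <stdint.h>

  // Hypothetical per-element reference; a sketch, not part of the diff.
  static inline uint8_t qu8_add_ref(
      uint8_t a, uint8_t b,
      int32_t bias, int32_t a_multiplier, int32_t b_multiplier,
      int32_t rounding, uint32_t shift,
      int32_t output_min_less_zero_point, int32_t output_max_less_zero_point,
      int32_t output_zero_point)
  {
    const int32_t acc = bias + (int32_t) a * a_multiplier + (int32_t) b * b_multiplier;
    // The kernels use asr_s32() from <xnnpack/math.h> for a portable arithmetic shift.
    int32_t out = (acc + rounding) >> shift;
    if (out < output_min_less_zero_point) out = output_min_less_zero_point;
    if (out > output_max_less_zero_point) out = output_max_less_zero_point;
    return (uint8_t) (out + output_zero_point);
  }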
diff --git a/src/qu8-vadd/gen/minmax-neon-ld64-x16.c b/src/qu8-vadd/gen/minmax-neon-ld64-x16.c
new file mode 100644
index 0000000..a6c8756
--- /dev/null
+++ b/src/qu8-vadd/gen/minmax-neon-ld64-x16.c
@@ -0,0 +1,112 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-vadd/neon-ld64.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16(
+    size_t n,
+    const uint8_t* input_a,
+    const uint8_t* input_b,
+    uint8_t* output,
+    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const uint8x8_t va_zero_point = vld1_dup_u8(&params->neon.a_zero_point);
+  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->neon.b_zero_point);
+  const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier);
+  const int32x4_t vb_multiplier = vld1q_dup_s32(&params->neon.b_multiplier);
+  const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+  const uint8x16_t voutput_min = vld1q_dup_u8(&params->neon.output_min);
+  const uint8x16_t voutput_max = vld1q_dup_u8(&params->neon.output_max);
+
+  for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+    const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8;
+    const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8;
+    const uint8x8_t va89ABCDEF = vld1_u8(input_a); input_a += 8;
+    const uint8x8_t vb89ABCDEF = vld1_u8(input_b); input_b += 8;
+
+    const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point));
+    const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
+    const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(va89ABCDEF, va_zero_point));
+    const int16x8_t vxb89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vb89ABCDEF, vb_zero_point));
+
+    int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+    int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+    int32x4_t vacc89AB = vmulq_s32(vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier);
+    int32x4_t vaccCDEF = vmulq_s32(vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier);
+
+    vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier);
+    vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier);
+    vacc89AB = vmlaq_s32(vacc89AB, vmovl_s16(vget_low_s16(vxb89ABCDEF)), vb_multiplier);
+    vaccCDEF = vmlaq_s32(vaccCDEF, vmovl_s16(vget_high_s16(vxb89ABCDEF)), vb_multiplier);
+
+    vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+    vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+    vacc89AB = vrshlq_s32(vacc89AB, vright_shift);
+    vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift);
+
+    const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+    const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point);
+
+    uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+
+    vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+
+    vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+
+    vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8;
+      const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8;
+
+      const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point));
+      const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
+
+      int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+      int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+
+      vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier);
+      vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier);
+
+      vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+      vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+
+      const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+
+      uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+      vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+      vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+      if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
+        vst1_u8(output, vout01234567); output += 8;
+        n -= 8 * sizeof(uint8_t);
+      } else {
+        if (n & (4 * sizeof(uint8_t))) {
+          vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_u8(vout01234567), 0); output += 4;
+          vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+        }
+        if (n & (2 * sizeof(uint8_t))) {
+          vst1_lane_u16(__builtin_assume_aligned(output, 1), vreinterpret_u16_u8(vout01234567), 0); output += 2;
+          vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+        }
+        if (n & (1 * sizeof(uint8_t))) {
+          vst1_lane_u8(output, vout01234567, 0);
+        }
+        n = 0;
+      }
+    } while (n != 0);
+  }
+}
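
Note on the rounding step above: vrshlq_s32 with a negative per-lane shift performs a rounding arithmetic shift right, so the NEON kernels match the scalar kernels' add-rounding-then-shift sequence, assuming the init code stores right_shift as the negated shift amount (the init code is not in this diff). A scalar model of one lane:

  #include <stdint.h>

  // Scalar model of vrshlq_s32(x, vdupq_n_s32(-s)) for s > 0; a sketch.
  static inline int32_t rounding_asr(int32_t x, uint32_t s) {
    // Widen to 64 bits so adding the rounding constant cannot overflow.
    const int64_t rounded = (int64_t) x + ((int64_t) 1 << (s - 1));
    return (int32_t) (rounded >> s);
  }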
diff --git a/src/qu8-vadd/gen/minmax-neon-ld64-x8.c b/src/qu8-vadd/gen/minmax-neon-ld64-x8.c
new file mode 100644
index 0000000..3ff0fb1
--- /dev/null
+++ b/src/qu8-vadd/gen/minmax-neon-ld64-x8.c
@@ -0,0 +1,95 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-vadd/neon-ld64.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qu8_vadd_minmax_ukernel__neon_ld64_x8(
+    size_t n,
+    const uint8_t* input_a,
+    const uint8_t* input_b,
+    uint8_t* output,
+    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const uint8x8_t va_zero_point = vld1_dup_u8(&params->neon.a_zero_point);
+  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->neon.b_zero_point);
+  const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier);
+  const int32x4_t vb_multiplier = vld1q_dup_s32(&params->neon.b_multiplier);
+  const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+  const uint8x16_t voutput_min = vld1q_dup_u8(&params->neon.output_min);
+  const uint8x16_t voutput_max = vld1q_dup_u8(&params->neon.output_max);
+
+  for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+    const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8;
+    const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8;
+
+    const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point));
+    const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
+
+    int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+    int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+
+    vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier);
+    vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier);
+
+    vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+    vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+
+    const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+
+    uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+
+    vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+
+    vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+    vst1_u8(output, vout01234567); output += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    {
+      const uint8x8_t va01234567 = vld1_u8(input_a);
+      const uint8x8_t vb01234567 = vld1_u8(input_b);
+
+      const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point));
+      const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
+
+      int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+      int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+
+      vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier);
+      vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier);
+
+      vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+      vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+
+      const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+
+      uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+      vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+      vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+      if (n & (4 * sizeof(uint8_t))) {
+        vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_u8(vout01234567), 0); output += 4;
+        vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+      }
+      if (n & (2 * sizeof(uint8_t))) {
+        vst1_lane_u16(__builtin_assume_aligned(output, 1), vreinterpret_u16_u8(vout01234567), 0); output += 2;
+        vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+      }
+      if (n & (1 * sizeof(uint8_t))) {
+        vst1_lane_u8(output, vout01234567, 0);
+      }
+    }
+  }
+}
diff --git a/src/qu8-vadd/gen/minmax-scalar-x1.c b/src/qu8-vadd/gen/minmax-scalar-x1.c
new file mode 100644
index 0000000..a2684e0
--- /dev/null
+++ b/src/qu8-vadd/gen/minmax-scalar-x1.c
@@ -0,0 +1,44 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-vadd/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/vadd.h>
+
+
+void xnn_qu8_vadd_minmax_ukernel__scalar_x1(
+    size_t n,
+    const uint8_t* input_a,
+    const uint8_t* input_b,
+    uint8_t* output,
+    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const int32_t vbias = params->scalar.bias;
+  const int32_t va_multiplier = params->scalar.a_multiplier;
+  const int32_t vb_multiplier = params->scalar.b_multiplier;
+  const int32_t vrounding = params->scalar.rounding;
+  const uint32_t vshift = params->scalar.shift;
+  const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
+  const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
+  const int32_t voutput_zero_point = params->scalar.output_zero_point;
+
+  do {
+    const int32_t va = *input_a++;
+    const int32_t vb = *input_b++;
+    const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
+
+    int32_t vout = asr_s32(vacc + vrounding, vshift);
+    vout = math_max_s32(vout, voutput_min_less_zero_point);
+    vout = math_min_s32(vout, voutput_max_less_zero_point);
+    *output++ = (uint8_t) (vout + voutput_zero_point);
+
+    n -= sizeof(uint8_t);
+  } while (n != 0);
+}
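
The scalar kernel above reads a precomputed bias where the NEON kernels subtract zero points per element; the two formulations agree when bias = -(a_zero_point * a_multiplier + b_zero_point * b_multiplier), since then bias + a*a_multiplier + b*b_multiplier == (a - a_zero_point)*a_multiplier + (b - b_zero_point)*b_multiplier. A worked example under assumed parameter values (zero points 0, both multipliers 1 << 19, i.e. a scale of 0.5 each, shift 20, rounding 1 << 19):

  #include <stdint.h>
  #include <stdio.h>

  int main(void) {
    const int32_t a_multiplier = 1 << 19;  // assumed: scale 0.5
    const int32_t b_multiplier = 1 << 19;  // assumed: scale 0.5
    const int32_t rounding = 1 << 19;      // assumed: 1 << (shift - 1)
    const uint32_t shift = 20;
    const int32_t acc = 0 /* bias */ + 100 * a_multiplier + 60 * b_multiplier;
    printf("%d\n", (acc + rounding) >> shift);  // prints 80 == 0.5*100 + 0.5*60
    return 0;
  }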
diff --git a/src/qu8-vadd/gen/minmax-scalar-x2.c b/src/qu8-vadd/gen/minmax-scalar-x2.c
new file mode 100644
index 0000000..00fd437
--- /dev/null
+++ b/src/qu8-vadd/gen/minmax-scalar-x2.c
@@ -0,0 +1,72 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-vadd/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/vadd.h>
+
+
+void xnn_qu8_vadd_minmax_ukernel__scalar_x2(
+    size_t n,
+    const uint8_t* input_a,
+    const uint8_t* input_b,
+    uint8_t* output,
+    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const int32_t vbias = params->scalar.bias;
+  const int32_t va_multiplier = params->scalar.a_multiplier;
+  const int32_t vb_multiplier = params->scalar.b_multiplier;
+  const int32_t vrounding = params->scalar.rounding;
+  const uint32_t vshift = params->scalar.shift;
+  const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
+  const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
+  const int32_t voutput_zero_point = params->scalar.output_zero_point;
+
+  for (; n >= 2 * sizeof(uint8_t); n -= 2 * sizeof(uint8_t)) {
+    const int32_t va0 = input_a[0];
+    const int32_t va1 = input_a[1];
+    input_a += 2;
+
+    const int32_t vb0 = input_b[0];
+    int32_t vacc0 = vbias + va0 * va_multiplier;
+    const int32_t vb1 = input_b[1];
+    int32_t vacc1 = vbias + va1 * va_multiplier;
+    input_b += 2;
+
+    vacc0 += vb0 * vb_multiplier;
+    vacc1 += vb1 * vb_multiplier;
+
+    int32_t vout0 = asr_s32(vacc0 + vrounding, vshift);
+    int32_t vout1 = asr_s32(vacc1 + vrounding, vshift);
+
+    vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
+    vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
+
+    vout0 = math_min_s32(vout0, voutput_max_less_zero_point);
+    vout1 = math_min_s32(vout1, voutput_max_less_zero_point);
+
+    vout0 += voutput_zero_point;
+    vout1 += voutput_zero_point;
+
+    output[0] = (uint8_t) vout0;
+    output[1] = (uint8_t) vout1;
+    output += 2;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    const int32_t va = *input_a;
+    const int32_t vb = *input_b;
+    const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
+
+    int32_t vout = asr_s32(vacc + vrounding, vshift);
+    vout = math_max_s32(vout, voutput_min_less_zero_point);
+    vout = math_min_s32(vout, voutput_max_less_zero_point);
+    *output++ = (uint8_t) (vout + voutput_zero_point);
+  }
+}
diff --git a/src/qu8-vadd/gen/minmax-scalar-x4.c b/src/qu8-vadd/gen/minmax-scalar-x4.c
new file mode 100644
index 0000000..5f9237e
--- /dev/null
+++ b/src/qu8-vadd/gen/minmax-scalar-x4.c
@@ -0,0 +1,94 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-vadd/scalar.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/vadd.h>
+
+
+void xnn_qu8_vadd_minmax_ukernel__scalar_x4(
+    size_t n,
+    const uint8_t* input_a,
+    const uint8_t* input_b,
+    uint8_t* output,
+    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const int32_t vbias = params->scalar.bias;
+  const int32_t va_multiplier = params->scalar.a_multiplier;
+  const int32_t vb_multiplier = params->scalar.b_multiplier;
+  const int32_t vrounding = params->scalar.rounding;
+  const uint32_t vshift = params->scalar.shift;
+  const int32_t voutput_min_less_zero_point = params->scalar.output_min_less_zero_point;
+  const int32_t voutput_max_less_zero_point = params->scalar.output_max_less_zero_point;
+  const int32_t voutput_zero_point = params->scalar.output_zero_point;
+
+  for (; n >= 4 * sizeof(uint8_t); n -= 4 * sizeof(uint8_t)) {
+    const int32_t va0 = input_a[0];
+    const int32_t va1 = input_a[1];
+    const int32_t va2 = input_a[2];
+    const int32_t va3 = input_a[3];
+    input_a += 4;
+
+    const int32_t vb0 = input_b[0];
+    int32_t vacc0 = vbias + va0 * va_multiplier;
+    const int32_t vb1 = input_b[1];
+    int32_t vacc1 = vbias + va1 * va_multiplier;
+    const int32_t vb2 = input_b[2];
+    int32_t vacc2 = vbias + va2 * va_multiplier;
+    const int32_t vb3 = input_b[3];
+    int32_t vacc3 = vbias + va3 * va_multiplier;
+    input_b += 4;
+
+    vacc0 += vb0 * vb_multiplier;
+    vacc1 += vb1 * vb_multiplier;
+    vacc2 += vb2 * vb_multiplier;
+    vacc3 += vb3 * vb_multiplier;
+
+    int32_t vout0 = asr_s32(vacc0 + vrounding, vshift);
+    int32_t vout1 = asr_s32(vacc1 + vrounding, vshift);
+    int32_t vout2 = asr_s32(vacc2 + vrounding, vshift);
+    int32_t vout3 = asr_s32(vacc3 + vrounding, vshift);
+
+    vout0 = math_max_s32(vout0, voutput_min_less_zero_point);
+    vout1 = math_max_s32(vout1, voutput_min_less_zero_point);
+    vout2 = math_max_s32(vout2, voutput_min_less_zero_point);
+    vout3 = math_max_s32(vout3, voutput_min_less_zero_point);
+
+    vout0 = math_min_s32(vout0, voutput_max_less_zero_point);
+    vout1 = math_min_s32(vout1, voutput_max_less_zero_point);
+    vout2 = math_min_s32(vout2, voutput_max_less_zero_point);
+    vout3 = math_min_s32(vout3, voutput_max_less_zero_point);
+
+    vout0 += voutput_zero_point;
+    vout1 += voutput_zero_point;
+    vout2 += voutput_zero_point;
+    vout3 += voutput_zero_point;
+
+    output[0] = (uint8_t) vout0;
+    output[1] = (uint8_t) vout1;
+    output[2] = (uint8_t) vout2;
+    output[3] = (uint8_t) vout3;
+    output += 4;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const int32_t va = *input_a++;
+      const int32_t vb = *input_b++;
+      const int32_t vacc = vbias + va * va_multiplier + vb * vb_multiplier;
+
+      int32_t vout = asr_s32(vacc + vrounding, vshift);
+      vout = math_max_s32(vout, voutput_min_less_zero_point);
+      vout = math_min_s32(vout, voutput_max_less_zero_point);
+      *output++ = (uint8_t) (vout + voutput_zero_point);
+
+      n -= sizeof(uint8_t);
+    } while (n != 0);
+  }
+}
diff --git a/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x16.c b/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x16.c
new file mode 100644
index 0000000..7deb623
--- /dev/null
+++ b/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x16.c
@@ -0,0 +1,149 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-vadd/sse-mul16-ld64.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x16(
+    size_t n,
+    const uint8_t* input_a,
+    const uint8_t* input_b,
+    uint8_t* output,
+    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
+  const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
+  const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
+  const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
+  const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
+  const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+  const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
+  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+  const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+
+  for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+    __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
+    __m128i vb01234567 = _mm_loadl_epi64((const __m128i*) input_b);
+    __m128i va89ABCDEF = _mm_loadl_epi64((const __m128i*) (input_a + 8));
+    __m128i vb89ABCDEF = _mm_loadl_epi64((const __m128i*) (input_b + 8));
+    input_a += 16;
+    input_b += 16;
+
+    const __m128i vzero = _mm_setzero_si128();
+    va01234567 = _mm_unpacklo_epi8(va01234567, vzero);
+    vb01234567 = _mm_unpacklo_epi8(vb01234567, vzero);
+    va89ABCDEF = _mm_unpacklo_epi8(va89ABCDEF, vzero);
+    vb89ABCDEF = _mm_unpacklo_epi8(vb89ABCDEF, vzero);
+
+    __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
+    __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
+    const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
+    const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
+    __m128i vaprod89ABCDEFhi = _mm_mulhi_epu16(va89ABCDEF, va_multiplier_lo);
+    __m128i vbprod89ABCDEFhi = _mm_mulhi_epu16(vb89ABCDEF, vb_multiplier_lo);
+    const __m128i vaprod89ABCDEFlo = _mm_mullo_epi16(va89ABCDEF, va_multiplier_lo);
+    const __m128i vbprod89ABCDEFlo = _mm_mullo_epi16(vb89ABCDEF, vb_multiplier_lo);
+
+    vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
+    vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
+    vaprod89ABCDEFhi = _mm_add_epi16(vaprod89ABCDEFhi, _mm_mullo_epi16(va89ABCDEF, va_multiplier_hi));
+    vbprod89ABCDEFhi = _mm_add_epi16(vbprod89ABCDEFhi, _mm_mullo_epi16(vb89ABCDEF, vb_multiplier_hi));
+
+
+    __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
+    __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
+    __m128i vacc89AB = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod89ABCDEFlo, vaprod89ABCDEFhi));
+    __m128i vaccCDEF = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod89ABCDEFlo, vaprod89ABCDEFhi));
+
+    vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
+    vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
+    vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vbprod89ABCDEFlo, vbprod89ABCDEFhi));
+    vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vbprod89ABCDEFlo, vbprod89ABCDEFhi));
+
+    vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
+    vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+    vacc89AB = _mm_sra_epi32(_mm_add_epi32(vacc89AB, vrounding), vshift);
+    vaccCDEF = _mm_sra_epi32(_mm_add_epi32(vaccCDEF, vrounding), vshift);
+
+    __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+    __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+
+
+    __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
+
+    vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
+
+    vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
+
+    _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+    output += 16;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
+      __m128i vb01234567 = _mm_loadl_epi64((const __m128i*) input_b);
+      input_a += 8;
+      input_b += 8;
+
+      const __m128i vzero = _mm_setzero_si128();
+      va01234567 = _mm_unpacklo_epi8(va01234567, vzero);
+      vb01234567 = _mm_unpacklo_epi8(vb01234567, vzero);
+
+      __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
+      __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
+      const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
+      const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
+
+      vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
+      vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
+
+
+      __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
+      __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
+
+      vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
+      vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+
+      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+      __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+      vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+      vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
+
+      if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
+        _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+        output += 8;
+        n -= 8 * sizeof(uint8_t);
+      } else {
+        if (n & (4 * sizeof(uint8_t))) {
+          *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+          vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+          output += 4;
+        }
+        if (n & (2 * sizeof(uint8_t))) {
+          *((uint16_t*) output) = (uint16_t) _mm_cvtsi128_si32(vout0123456701234567);
+          vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+          output += 2;
+        }
+        if (n & (1 * sizeof(uint8_t))) {
+          *output = (uint8_t) _mm_cvtsi128_si32(vout0123456701234567);
+        }
+        n = 0;
+      }
+    } while (n != 0);
+  }
+}
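
Why "MUL16": SSE2 has no 32-bit lane multiply, so each 32-bit product x * m (x a zero-extended uint8 value, m a multiplier wider than 16 bits) is assembled from 16-bit halves; the _mm_mulhi_epu16/_mm_mullo_epi16 pairs and the _mm_unpacklo/hi_epi16 interleaves above implement the identity below. The m_lo/m_hi split corresponds to the a_multiplier_lo/a_multiplier_hi halves prepared by the params initialization, which is not part of this diff. A scalar sketch:

  #include <stdint.h>

  // Scalar model of the SSE2 MUL16 product assembly: computes (x * m) mod 2^32.
  static inline uint32_t mul16(uint16_t x, uint32_t m) {
    const uint32_t m_lo = m & 0xFFFF;
    const uint32_t m_hi = m >> 16;
    const uint32_t p_lo = ((uint32_t) x * m_lo) & 0xFFFF;         // _mm_mullo_epi16
    const uint32_t p_hi = ((((uint32_t) x * m_lo) >> 16)          // _mm_mulhi_epu16
                        + (uint32_t) x * m_hi) & 0xFFFF;          // + _mm_mullo_epi16
    return (p_hi << 16) | p_lo;                                   // _mm_unpack{lo,hi}_epi16
  }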
diff --git a/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x8.c b/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x8.c
new file mode 100644
index 0000000..c8095f3
--- /dev/null
+++ b/src/qu8-vadd/gen/minmax-sse2-mul16-ld64-x8.c
@@ -0,0 +1,123 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-vadd/sse-mul16-ld64.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x8(
+    size_t n,
+    const uint8_t* input_a,
+    const uint8_t* input_b,
+    uint8_t* output,
+    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const __m128i vbias = _mm_load_si128((const __m128i*) params->sse2.bias);
+  const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_lo);
+  const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.a_multiplier_hi);
+  const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_lo);
+  const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) params->sse2.b_multiplier_hi);
+  const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
+  const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
+  const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+  const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+  const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+
+  for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+    __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
+    __m128i vb01234567 = _mm_loadl_epi64((const __m128i*) input_b);
+    input_a += 8;
+    input_b += 8;
+
+    const __m128i vzero = _mm_setzero_si128();
+    va01234567 = _mm_unpacklo_epi8(va01234567, vzero);
+    vb01234567 = _mm_unpacklo_epi8(vb01234567, vzero);
+
+    __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
+    __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
+    const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
+    const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
+
+    vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
+    vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
+
+
+    __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
+    __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
+
+    vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
+    vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
+
+    vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
+    vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+
+    __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+
+    __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+
+    vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+
+    vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
+
+    _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+    output += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    {
+      __m128i va01234567 = _mm_loadl_epi64((const __m128i*) input_a);
+      __m128i vb01234567 = _mm_loadl_epi64((const __m128i*) input_b);
+
+      const __m128i vzero = _mm_setzero_si128();
+      va01234567 = _mm_unpacklo_epi8(va01234567, vzero);
+      vb01234567 = _mm_unpacklo_epi8(vb01234567, vzero);
+
+      __m128i vaprod01234567hi = _mm_mulhi_epu16(va01234567, va_multiplier_lo);
+      __m128i vbprod01234567hi = _mm_mulhi_epu16(vb01234567, vb_multiplier_lo);
+      const __m128i vaprod01234567lo = _mm_mullo_epi16(va01234567, va_multiplier_lo);
+      const __m128i vbprod01234567lo = _mm_mullo_epi16(vb01234567, vb_multiplier_lo);
+
+      vaprod01234567hi = _mm_add_epi16(vaprod01234567hi, _mm_mullo_epi16(va01234567, va_multiplier_hi));
+      vbprod01234567hi = _mm_add_epi16(vbprod01234567hi, _mm_mullo_epi16(vb01234567, vb_multiplier_hi));
+
+
+      __m128i vacc0123 = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vaprod01234567lo, vaprod01234567hi));
+      __m128i vacc4567 = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vaprod01234567lo, vaprod01234567hi));
+
+      vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vbprod01234567lo, vbprod01234567hi));
+      vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vbprod01234567lo, vbprod01234567hi));
+
+      vacc0123 = _mm_sra_epi32(_mm_add_epi32(vacc0123, vrounding), vshift);
+      vacc4567 = _mm_sra_epi32(_mm_add_epi32(vacc4567, vrounding), vshift);
+
+      __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+      __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
+      vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
+      vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
+
+      if (n & (4 * sizeof(uint8_t))) {
+        *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+        vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+        output += 4;
+      }
+      if (n & (2 * sizeof(uint8_t))) {
+        *((uint16_t*) output) = (uint16_t) _mm_cvtsi128_si32(vout0123456701234567);
+        vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+        output += 2;
+      }
+      if (n & (1 * sizeof(uint8_t))) {
+        *output = (uint8_t) _mm_cvtsi128_si32(vout0123456701234567);
+      }
+    }
+  }
+}
diff --git a/src/qu8-vadd/gen/minmax-wasmsimd-x16.c b/src/qu8-vadd/gen/minmax-wasmsimd-x16.c
new file mode 100644
index 0000000..563534c
--- /dev/null
+++ b/src/qu8-vadd/gen/minmax-wasmsimd-x16.c
@@ -0,0 +1,112 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-vadd/wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qu8_vadd_minmax_ukernel__wasmsimd_x16(
+    size_t n,
+    const uint8_t* input_a,
+    const uint8_t* input_b,
+    uint8_t* output,
+    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const v128_t vbias = wasm_v128_load(params->wasmsimd.bias);
+  const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
+  const v128_t vb_multiplier = wasm_v128_load(params->wasmsimd.b_multiplier);
+  const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
+  const int32_t vshift = params->wasmsimd.shift;
+  const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
+  const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
+  const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+
+  for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+    const v128_t va01234567 = wasm_u16x8_load8x8(input_a);
+    const v128_t vb01234567 = wasm_u16x8_load8x8(input_b);
+    const v128_t va89ABCDEF = wasm_u16x8_load8x8(input_a + 8);
+    const v128_t vb89ABCDEF = wasm_u16x8_load8x8(input_b + 8);
+    input_a += 16;
+    input_b += 16;
+
+    v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va01234567), va_multiplier));
+    v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va01234567), va_multiplier));
+    v128_t vacc89AB = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va89ABCDEF), va_multiplier));
+    v128_t vaccCDEF = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va89ABCDEF), va_multiplier));
+
+    vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vb01234567), vb_multiplier));
+    vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vb01234567), vb_multiplier));
+    vacc89AB = wasm_i32x4_add(vacc89AB, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vb89ABCDEF), vb_multiplier));
+    vaccCDEF = wasm_i32x4_add(vaccCDEF, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vb89ABCDEF), vb_multiplier));
+
+    vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
+    vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+    vacc89AB = wasm_i32x4_shr(wasm_i32x4_add(vacc89AB, vrounding), vshift);
+    vaccCDEF = wasm_i32x4_shr(wasm_i32x4_add(vaccCDEF, vrounding), vshift);
+
+    v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
+    v128_t vout89ABCDEF = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc89AB, vaccCDEF), voutput_zero_point);
+
+    v128_t vout0123456789ABCDEF = wasm_u8x16_narrow_i16x8(vout01234567, vout89ABCDEF);
+
+    vout0123456789ABCDEF = wasm_u8x16_max(vout0123456789ABCDEF, voutput_min);
+
+    vout0123456789ABCDEF = wasm_u8x16_min(vout0123456789ABCDEF, voutput_max);
+
+    wasm_v128_store(output, vout0123456789ABCDEF);
+    output += 16;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    do {
+      const v128_t va01234567 = wasm_u16x8_load8x8(input_a);
+      const v128_t vb01234567 = wasm_u16x8_load8x8(input_b);
+      input_a += 8;
+      input_b += 8;
+
+      v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va01234567), va_multiplier));
+      v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va01234567), va_multiplier));
+
+      vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vb01234567), vb_multiplier));
+      vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vb01234567), vb_multiplier));
+
+      vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
+      vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+
+      v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
+
+      v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
+      vout0123456701234567 = wasm_u8x16_max(vout0123456701234567, voutput_min);
+      vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
+
+      if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
+        *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
+        output += 8;
+        n -= 8 * sizeof(uint8_t);
+      } else {
+        if (n & (4 * sizeof(uint8_t))) {
+          *((uint32_t*) output) = (uint32_t) wasm_i32x4_extract_lane(vout0123456701234567, 0);
+          vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
+          output += 4;
+        }
+        if (n & (2 * sizeof(uint8_t))) {
+          *((uint16_t*) output) = (uint16_t) wasm_i16x8_extract_lane(vout0123456701234567, 0);
+          vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16);
+          output += 2;
+        }
+        if (n & (1 * sizeof(uint8_t))) {
+          *output = (uint8_t) wasm_i8x16_extract_lane(vout0123456701234567, 0);
+        }
+        n = 0;
+      }
+    } while (n != 0);
+  }
+}
diff --git a/src/qu8-vadd/gen/minmax-wasmsimd-x8.c b/src/qu8-vadd/gen/minmax-wasmsimd-x8.c
new file mode 100644
index 0000000..df5b45f
--- /dev/null
+++ b/src/qu8-vadd/gen/minmax-wasmsimd-x8.c
@@ -0,0 +1,94 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-vadd/wasmsimd.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qu8_vadd_minmax_ukernel__wasmsimd_x8(
+    size_t n,
+    const uint8_t* input_a,
+    const uint8_t* input_b,
+    uint8_t* output,
+    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const v128_t vbias = wasm_v128_load(params->wasmsimd.bias);
+  const v128_t va_multiplier = wasm_v128_load(params->wasmsimd.a_multiplier);
+  const v128_t vb_multiplier = wasm_v128_load(params->wasmsimd.b_multiplier);
+  const v128_t vrounding = wasm_v128_load(params->wasmsimd.rounding);
+  const int32_t vshift = params->wasmsimd.shift;
+  const v128_t voutput_zero_point = wasm_v128_load(params->wasmsimd.output_zero_point);
+  const v128_t voutput_min = wasm_v128_load(params->wasmsimd.output_min);
+  const v128_t voutput_max = wasm_v128_load(params->wasmsimd.output_max);
+
+  for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
+    const v128_t va01234567 = wasm_u16x8_load8x8(input_a);
+    const v128_t vb01234567 = wasm_u16x8_load8x8(input_b);
+    input_a += 8;
+    input_b += 8;
+
+    v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va01234567), va_multiplier));
+    v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va01234567), va_multiplier));
+
+    vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vb01234567), vb_multiplier));
+    vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vb01234567), vb_multiplier));
+
+    vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
+    vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+
+    v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
+
+    v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
+
+    vout0123456701234567 = wasm_u8x16_max(vout0123456701234567, voutput_min);
+
+    vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
+
+    *((double*) output) = wasm_f64x2_extract_lane(vout0123456701234567, 0);
+    output += 8;
+  }
+  if XNN_UNLIKELY(n != 0) {
+    {
+      const v128_t va01234567 = wasm_u16x8_load8x8(input_a);
+      const v128_t vb01234567 = wasm_u16x8_load8x8(input_b);
+
+      v128_t vacc0123 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(va01234567), va_multiplier));
+      v128_t vacc4567 = wasm_i32x4_add(vbias, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(va01234567), va_multiplier));
+
+      vacc0123 = wasm_i32x4_add(vacc0123, wasm_i32x4_mul(wasm_u32x4_extend_low_u16x8(vb01234567), vb_multiplier));
+      vacc4567 = wasm_i32x4_add(vacc4567, wasm_i32x4_mul(wasm_u32x4_extend_high_u16x8(vb01234567), vb_multiplier));
+
+      vacc0123 = wasm_i32x4_shr(wasm_i32x4_add(vacc0123, vrounding), vshift);
+      vacc4567 = wasm_i32x4_shr(wasm_i32x4_add(vacc4567, vrounding), vshift);
+
+      v128_t vout01234567 = wasm_i16x8_add_sat(wasm_i16x8_narrow_i32x4(vacc0123, vacc4567), voutput_zero_point);
+
+      v128_t vout0123456701234567 = wasm_u8x16_narrow_i16x8(vout01234567, vout01234567);
+      vout0123456701234567 = wasm_u8x16_max(vout0123456701234567, voutput_min);
+      vout0123456701234567 = wasm_u8x16_min(vout0123456701234567, voutput_max);
+
+      if (n & (4 * sizeof(uint8_t))) {
+        *((uint32_t*) output) = (uint32_t) wasm_i32x4_extract_lane(vout0123456701234567, 0);
+        vout0123456701234567 = wasm_u64x2_shr(vout0123456701234567, 32);
+        output += 4;
+      }
+      if (n & (2 * sizeof(uint8_t))) {
+        *((uint16_t*) output) = (uint16_t) wasm_i16x8_extract_lane(vout0123456701234567, 0);
+        vout0123456701234567 = wasm_u32x4_shr(vout0123456701234567, 16);
+        output += 2;
+      }
+      if (n & (1 * sizeof(uint8_t))) {
+        *output = (uint8_t) wasm_i8x16_extract_lane(vout0123456701234567, 0);
+      }
+    }
+  }
+}
diff --git a/src/qu8-vadd/minmax-neon.c b/src/qu8-vadd/minmax-neon.c
deleted file mode 100644
index 2cdca03..0000000
--- a/src/qu8-vadd/minmax-neon.c
+++ /dev/null
@@ -1,246 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <arm_neon.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/vadd.h>
-
-
-void xnn_qu8_vadd_minmax_ukernel__neon_x32(
-    size_t n,
-    const uint8_t* a,
-    const uint8_t* b,
-    uint8_t* y,
-    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
-  const uint8x8_t va_zero_point = vld1_dup_u8(&params->neon.a_zero_point);
-  const uint8x8_t vb_zero_point = vld1_dup_u8(&params->neon.b_zero_point);
-  const int16x8_t vy_zero_point = vld1q_dup_s16(&params->neon.y_zero_point);
-  const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier);
-  const int32x4_t vb_multiplier = vld1q_dup_s32(&params->neon.b_multiplier);
-  const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
-  const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
-  const uint8x16_t vy_max = vld1q_dup_u8(&params->neon.y_max);
-  const uint8x16_t vy_min = vld1q_dup_u8(&params->neon.y_min);
-#if XNN_ARCH_ARM64
-  for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
-    const uint8x16_t va01 = vld1q_u8(a); a += 16;
-    const uint8x16_t vb01 = vld1q_u8(b); b += 16;
-    const uint8x16_t va23 = vld1q_u8(a); a += 16;
-    const uint8x16_t vb23 = vld1q_u8(b); b += 16;
-
-    // Subtract zero point.
-    const int16x8_t vxa0 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va01), va_zero_point));
-    const int16x8_t vxb0 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(vb01), vb_zero_point));
-    const int16x8_t vxa1 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va01), va_zero_point));
-    const int16x8_t vxb1 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(vb01), vb_zero_point));
-    const int16x8_t vxa2 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va23), va_zero_point));
-    const int16x8_t vxb2 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(vb23), vb_zero_point));
-    const int16x8_t vxa3 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va23), va_zero_point));
-    const int16x8_t vxb3 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(vb23), vb_zero_point));
-
-    // Multiply by factors and accumulate products.
-    int32x4_t vacc0_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa0)), va_multiplier);
-    int32x4_t vacc1_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa1)), va_multiplier);
-    int32x4_t vacc2_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa2)), va_multiplier);
-    int32x4_t vacc3_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa3)), va_multiplier);
-    int32x4_t vacc0_hi = vmulq_s32(vmovl_high_s16(vxa0), va_multiplier);
-    int32x4_t vacc1_hi = vmulq_s32(vmovl_high_s16(vxa1), va_multiplier);
-    int32x4_t vacc2_hi = vmulq_s32(vmovl_high_s16(vxa2), va_multiplier);
-    int32x4_t vacc3_hi = vmulq_s32(vmovl_high_s16(vxa3), va_multiplier);
-
-    vacc0_lo = vmlaq_s32(vacc0_lo, vmovl_s16(vget_low_s16(vxb0)), vb_multiplier);
-    vacc1_lo = vmlaq_s32(vacc1_lo, vmovl_s16(vget_low_s16(vxb1)), vb_multiplier);
-    vacc2_lo = vmlaq_s32(vacc2_lo, vmovl_s16(vget_low_s16(vxb2)), vb_multiplier);
-    vacc3_lo = vmlaq_s32(vacc3_lo, vmovl_s16(vget_low_s16(vxb3)), vb_multiplier);
-    vacc0_hi = vmlaq_s32(vacc0_hi, vmovl_high_s16(vxb0), vb_multiplier);
-    vacc1_hi = vmlaq_s32(vacc1_hi, vmovl_high_s16(vxb1), vb_multiplier);
-    vacc2_hi = vmlaq_s32(vacc2_hi, vmovl_high_s16(vxb2), vb_multiplier);
-    vacc3_hi = vmlaq_s32(vacc3_hi, vmovl_high_s16(vxb3), vb_multiplier);
-
-    // Shift right and round.
-    vacc0_lo = vsraq_n_s32(vacc0_lo, vbicq_s32(vacc0_lo, vzero_shift_mask), 31);
-    vacc1_lo = vsraq_n_s32(vacc1_lo, vbicq_s32(vacc1_lo, vzero_shift_mask), 31);
-    vacc2_lo = vsraq_n_s32(vacc2_lo, vbicq_s32(vacc2_lo, vzero_shift_mask), 31);
-    vacc3_lo = vsraq_n_s32(vacc3_lo, vbicq_s32(vacc3_lo, vzero_shift_mask), 31);
-    vacc0_hi = vsraq_n_s32(vacc0_hi, vbicq_s32(vacc0_hi, vzero_shift_mask), 31);
-    vacc1_hi = vsraq_n_s32(vacc1_hi, vbicq_s32(vacc1_hi, vzero_shift_mask), 31);
-    vacc2_hi = vsraq_n_s32(vacc2_hi, vbicq_s32(vacc2_hi, vzero_shift_mask), 31);
-    vacc3_hi = vsraq_n_s32(vacc3_hi, vbicq_s32(vacc3_hi, vzero_shift_mask), 31);
-
-    vacc0_lo = vrshlq_s32(vacc0_lo, vright_shift);
-    vacc1_lo = vrshlq_s32(vacc1_lo, vright_shift);
-    vacc2_lo = vrshlq_s32(vacc2_lo, vright_shift);
-    vacc3_lo = vrshlq_s32(vacc3_lo, vright_shift);
-    vacc0_hi = vrshlq_s32(vacc0_hi, vright_shift);
-    vacc1_hi = vrshlq_s32(vacc1_hi, vright_shift);
-    vacc2_hi = vrshlq_s32(vacc2_hi, vright_shift);
-    vacc3_hi = vrshlq_s32(vacc3_hi, vright_shift);
-
-    // Pack, saturate, and add output zero point.
-    const int16x8_t vacc0 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0_lo), vacc0_hi), vy_zero_point);
-    const int16x8_t vacc1 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1_lo), vacc1_hi), vy_zero_point);
-    const int16x8_t vacc2 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2_lo), vacc2_hi), vy_zero_point);
-    const int16x8_t vacc3 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3_lo), vacc3_hi), vy_zero_point);
-
-    uint8x16_t vy01 = vqmovun_high_s16(vqmovun_s16(vacc0), vacc1);
-    uint8x16_t vy23 = vqmovun_high_s16(vqmovun_s16(vacc2), vacc3);
-
-    vy01 = vmaxq_u8(vy01, vy_min);
-    vy23 = vmaxq_u8(vy23, vy_min);
-    vy01 = vminq_u8(vy01, vy_max);
-    vy23 = vminq_u8(vy23, vy_max);
-
-    vst1q_u8(y, vy01); y += 16;
-    vst1q_u8(y, vy23); y += 16;
-  }
-#else
-  for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
-    const uint8x16_t va01 = vld1q_u8(a); a += 16;
-    const uint8x16_t vb01 = vld1q_u8(b); b += 16;
-
-    // Subtract zero point.
-    const int16x8_t vxa0 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va01), va_zero_point));
-    const int16x8_t vxb0 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(vb01), vb_zero_point));
-    const int16x8_t vxa1 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va01), va_zero_point));
-    const int16x8_t vxb1 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(vb01), vb_zero_point));
-
-    // Multiply by factors and accumulate products.
-    int32x4_t vacc0_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa0)), va_multiplier);
-    int32x4_t vacc1_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa1)), va_multiplier);
-    int32x4_t vacc0_hi = vmulq_s32(vmovl_s16(vget_high_s16(vxa0)), va_multiplier);
-    int32x4_t vacc1_hi = vmulq_s32(vmovl_s16(vget_high_s16(vxa1)), va_multiplier);
-
-    __builtin_prefetch(a + 640);
-    __builtin_prefetch(b + 640);
-
-    vacc0_lo = vmlaq_s32(vacc0_lo, vmovl_s16(vget_low_s16(vxb0)), vb_multiplier);
-    vacc1_lo = vmlaq_s32(vacc1_lo, vmovl_s16(vget_low_s16(vxb1)), vb_multiplier);
-    vacc0_hi = vmlaq_s32(vacc0_hi, vmovl_s16(vget_high_s16(vxb0)), vb_multiplier);
-    vacc1_hi = vmlaq_s32(vacc1_hi, vmovl_s16(vget_high_s16(vxb1)), vb_multiplier);
-
-    // Shift right and round.
-    vacc0_lo = vsraq_n_s32(vacc0_lo, vbicq_s32(vacc0_lo, vzero_shift_mask), 31);
-    vacc1_lo = vsraq_n_s32(vacc1_lo, vbicq_s32(vacc1_lo, vzero_shift_mask), 31);
-    vacc0_hi = vsraq_n_s32(vacc0_hi, vbicq_s32(vacc0_hi, vzero_shift_mask), 31);
-    vacc1_hi = vsraq_n_s32(vacc1_hi, vbicq_s32(vacc1_hi, vzero_shift_mask), 31);
-
-    vacc0_lo = vrshlq_s32(vacc0_lo, vright_shift);
-    vacc1_lo = vrshlq_s32(vacc1_lo, vright_shift);
-    vacc0_hi = vrshlq_s32(vacc0_hi, vright_shift);
-    vacc1_hi = vrshlq_s32(vacc1_hi, vright_shift);
-
-    // Pack, saturate, and add output zero point.
-    const int16x8_t vacc0 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0_lo), vqmovn_s32(vacc0_hi)), vy_zero_point);
-    const int16x8_t vacc1 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1_lo), vqmovn_s32(vacc1_hi)), vy_zero_point);
-
-    uint8x16_t vy01 = vcombine_u8(vqmovun_s16(vacc0), vqmovun_s16(vacc1));
-    vy01 = vmaxq_u8(vy01, vy_min);
-    vy01 = vminq_u8(vy01, vy_max);
-
-    vst1q_u8(y, vy01); y += 16;
-  }
-#endif
-  for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
-    const uint8x8_t va = vld1_u8(a); a += 8;
-    const uint8x8_t vb = vld1_u8(b); b += 8;
-
-    // Subtract zero point.
-    const int16x8_t vxa = vreinterpretq_s16_u16(vsubl_u8(va, va_zero_point));
-    const int16x8_t vxb = vreinterpretq_s16_u16(vsubl_u8(vb, vb_zero_point));
-
-    // Multiply by factors and accumulate products.
-    int32x4_t vacc_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa)), va_multiplier);
-#if XNN_ARCH_ARM64
-    int32x4_t vacc_hi = vmulq_s32(vmovl_high_s16(vxa), va_multiplier);
-#else
-    int32x4_t vacc_hi = vmulq_s32(vmovl_s16(vget_high_s16(vxa)), va_multiplier);
-#endif
-
-    vacc_lo = vmlaq_s32(vacc_lo, vmovl_s16(vget_low_s16(vxb)), vb_multiplier);
-#if XNN_ARCH_ARM64
-    vacc_hi = vmlaq_s32(vacc_hi, vmovl_high_s16(vxb), vb_multiplier);
-#else
-    vacc_hi = vmlaq_s32(vacc_hi, vmovl_s16(vget_high_s16(vxb)), vb_multiplier);
-#endif
-
-    // Shift right and round.
-    vacc_lo = vsraq_n_s32(vacc_lo, vbicq_s32(vacc_lo, vzero_shift_mask), 31);
-    vacc_hi = vsraq_n_s32(vacc_hi, vbicq_s32(vacc_hi, vzero_shift_mask), 31);
-
-    vacc_lo = vrshlq_s32(vacc_lo, vright_shift);
-    vacc_hi = vrshlq_s32(vacc_hi, vright_shift);
-
-    // Pack, saturate, and add output zero point.
-#if XNN_ARCH_ARM64
-    const int16x8_t vacc = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi), vy_zero_point);
-#else
-    const int16x8_t vacc = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)), vy_zero_point);
-#endif
-
-    uint8x8_t vy = vqmovun_s16(vacc);
-    vy = vmax_u8(vy, vget_low_u8(vy_min));
-    vy = vmin_u8(vy, vget_low_u8(vy_max));
-
-    vst1_u8(y, vy); y += 8;
-  }
-  if (n != 0) {
-    const uint8x8_t va = vld1_u8(a);
-    const uint8x8_t vb = vld1_u8(b);
-
-    // Subtract zero point.
-    const int16x8_t vxa = vreinterpretq_s16_u16(vsubl_u8(va, va_zero_point));
-    const int16x8_t vxb = vreinterpretq_s16_u16(vsubl_u8(vb, vb_zero_point));
-
-    // Multiply by factors and accumulate products.
-    int32x4_t vacc_lo = vmulq_s32(vmovl_s16(vget_low_s16(vxa)), va_multiplier);
-#if XNN_ARCH_ARM64
-    int32x4_t vacc_hi = vmulq_s32(vmovl_high_s16(vxa), va_multiplier);
-#else
-    int32x4_t vacc_hi = vmulq_s32(vmovl_s16(vget_high_s16(vxa)), va_multiplier);
-#endif
-
-    vacc_lo = vmlaq_s32(vacc_lo, vmovl_s16(vget_low_s16(vxb)), vb_multiplier);
-#if XNN_ARCH_ARM64
-    vacc_hi = vmlaq_s32(vacc_hi, vmovl_high_s16(vxb), vb_multiplier);
-#else
-    vacc_hi = vmlaq_s32(vacc_hi, vmovl_s16(vget_high_s16(vxb)), vb_multiplier);
-#endif
-
-    // Shift right and round.
-    vacc_lo = vsraq_n_s32(vacc_lo, vbicq_s32(vacc_lo, vzero_shift_mask), 31);
-    vacc_hi = vsraq_n_s32(vacc_hi, vbicq_s32(vacc_hi, vzero_shift_mask), 31);
-
-    vacc_lo = vrshlq_s32(vacc_lo, vright_shift);
-    vacc_hi = vrshlq_s32(vacc_hi, vright_shift);
-
-    // Pack, saturate, and add output zero point.
-#if XNN_ARCH_ARM64
-    const int16x8_t vacc = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc_lo), vacc_hi), vy_zero_point);
-#else
-    const int16x8_t vacc = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc_lo), vqmovn_s32(vacc_hi)), vy_zero_point);
-#endif
-
-    uint8x8_t vy = vqmovun_s16(vacc);
-    vy = vmax_u8(vy, vget_low_u8(vy_min));
-    vy = vmin_u8(vy, vget_low_u8(vy_max));
-
-    if (n & (4 * sizeof(uint8_t))) {
-      vst1_lane_u32(__builtin_assume_aligned(y, 1), vreinterpret_u32_u8(vy), 0); y += 4;
-      vy = vext_u8(vy, vy, 4);
-    }
-    if (n & (2 * sizeof(uint8_t))) {
-      vst1_lane_u16(__builtin_assume_aligned(y, 1), vreinterpret_u16_u8(vy), 0); y += 2;
-      vy = vext_u8(vy, vy, 2);
-    }
-    if (n & (1 * sizeof(uint8_t))) {
-      vst1_lane_u8(y, vy, 0);
-    }
-  }
-}
diff --git a/src/qu8-vadd/minmax-scalar.c b/src/qu8-vadd/minmax-scalar.c
deleted file mode 100644
index b38c5e2..0000000
--- a/src/qu8-vadd/minmax-scalar.c
+++ /dev/null
@@ -1,56 +0,0 @@
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <assert.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/vadd.h>
-#include <xnnpack/math.h>
-
-
-void xnn_qu8_vadd_minmax_ukernel__scalar_x1(
-    size_t n,
-    const uint8_t* a,
-    const uint8_t* b,
-    uint8_t* y,
-    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
-{
-  assert(n != 0);
-
-  const int32_t vzero_point_product = params->scalar.zero_point_product;
-  const uint32_t va_multiplier = params->scalar.a_multiplier;
-  const uint32_t vb_multiplier = params->scalar.b_multiplier;
-  const uint32_t vshift = params->scalar.shift;
-  const int32_t vremainder_mask = params->scalar.remainder_mask;
-  const int32_t vremainder_threshold = params->scalar.remainder_threshold;
-  const int32_t vy_zero_point = params->scalar.y_zero_point;
-  const int32_t vy_max = params->scalar.y_max;
-  const int32_t vy_min = params->scalar.y_min;
-
-  do {
-    const int32_t va = (int32_t) (uint32_t) *a++;
-    const int32_t vb = (int32_t) (uint32_t) *b++;
-
-    // Multiply by factors.
-    const int32_t va_product = va * va_multiplier;
-    const int32_t vb_product = vb * vb_multiplier;
-
-    // Accumulate products.
-    const int32_t vacc = vzero_point_product + va_product + vb_product;
-
-    // Shift right and round.
-    const int32_t vremainder = (vacc & vremainder_mask) - (int32_t) (vacc < 0);
-    int32_t vy = asr_s32(vacc, vshift) + (int32_t) (vremainder > vremainder_threshold);
-
-    // Pack, saturate, and add output zero point.
-    vy += vy_zero_point;
-    vy = vy < vy_min ? vy_min : vy;
-    vy = vy > vy_max ? vy_max : vy;
-
-    *y++ = vy;
-
-    n -= sizeof(uint8_t);
-  } while (n != 0);
-}
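
The deleted scalar kernel above rounded with a remainder mask and threshold (round to nearest, ties away from zero); the generated replacement pre-adds a rounding constant and shifts (round to nearest, ties toward positive infinity). The two differ only on negative ties. Side by side, as a sketch assuming the usual init constants rounding = 1 << (shift - 1), mask = (1 << shift) - 1, threshold = mask >> 1:

  #include <stdint.h>

  // Portable arithmetic shift right (the kernels use asr_s32 from <xnnpack/math.h>).
  static inline int32_t asr(int32_t x, uint32_t n) {
    return x >= 0 ? x >> n : ~(~x >> n);
  }

  // Old scheme (this file): round to nearest, ties away from zero.
  static inline int32_t round_old(int32_t acc, uint32_t shift) {
    const int32_t mask = (int32_t) (((uint32_t) 1 << shift) - 1);
    const int32_t remainder = (acc & mask) - (int32_t) (acc < 0);
    return asr(acc, shift) + (int32_t) (remainder > (mask >> 1));
  }

  // New scheme (generated kernels): round to nearest, ties toward +infinity.
  static inline int32_t round_new(int32_t acc, uint32_t shift) {
    return asr(acc + ((int32_t) 1 << (shift - 1)), shift);
  }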
diff --git a/src/qu8-vadd/minmax-sse2.c b/src/qu8-vadd/minmax-sse2.c
deleted file mode 100644
index 458fa7b..0000000
--- a/src/qu8-vadd/minmax-sse2.c
+++ /dev/null
@@ -1,129 +0,0 @@
-// Copyright (c) Facebook, Inc. and its affiliates.
-// All rights reserved.
-//
-// Copyright 2019 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-#include <immintrin.h>
-
-#include <xnnpack/common.h>
-#include <xnnpack/vadd.h>
-
-
-void xnn_qu8_vadd_minmax_ukernel__sse2_x8(
-    size_t n,
-    const uint8_t* a,
-    const uint8_t* b,
-    uint8_t* y,
-    const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
-{
-  const __m128i vzero_point_product = _mm_load_si128((const __m128i*) &params->sse2.zero_point_product);
-  const __m128i va_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.a_multiplier_lo);
-  const __m128i va_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.a_multiplier_hi);
-  const __m128i vb_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.b_multiplier_lo);
-  const __m128i vb_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.b_multiplier_hi);
-  const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
-  const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
-  const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
-
-  const __m128i vzero = _mm_setzero_si128();
-  for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
-    const __m128i va = _mm_loadl_epi64((const __m128i*) a);
-    a += 8;
-    const __m128i vb = _mm_loadl_epi64((const __m128i*) b);
-    b += 8;
-
-    const __m128i vxa = _mm_unpacklo_epi8(va, vzero);
-    const __m128i vxb = _mm_unpacklo_epi8(vb, vzero);
-
-    // Multiply by factors.
-    const __m128i va_product_lo = _mm_mullo_epi16(vxa, va_multiplier_lo);
-    const __m128i va_product_hi =
-      _mm_add_epi16(_mm_mulhi_epu16(vxa, va_multiplier_lo), _mm_mullo_epi16(vxa, va_multiplier_hi));
-
-    const __m128i vb_product_lo = _mm_mullo_epi16(vxb, vb_multiplier_lo);
-    const __m128i vb_product_hi =
-      _mm_add_epi16(_mm_mulhi_epu16(vxb, vb_multiplier_lo), _mm_mullo_epi16(vxb, vb_multiplier_hi));
-
-    // Accumulate products.
-    __m128i vacc_lo = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(va_product_lo, va_product_hi));
-    __m128i vacc_hi = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(va_product_lo, va_product_hi));
-
-    vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vb_product_lo, vb_product_hi));
-    vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vb_product_lo, vb_product_hi));
-
-    // Shift right and round.
-    const __m128i vrem_lo =
-      _mm_add_epi32(_mm_and_si128(vacc_lo, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo));
-    const __m128i vrem_hi =
-      _mm_add_epi32(_mm_and_si128(vacc_hi, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi));
-
-    vacc_lo = _mm_sub_epi32(_mm_sra_epi32(vacc_lo, vshift), _mm_cmpgt_epi32(vrem_lo, vremainder_threshold));
-    vacc_hi = _mm_sub_epi32(_mm_sra_epi32(vacc_hi, vshift), _mm_cmpgt_epi32(vrem_hi, vremainder_threshold));
-
-    // Pack, saturate, and add output zero point.
-    const __m128i vy_zero_point = _mm_load_si128((const __m128i*) params->sse2.y_zero_point);
-    const __m128i vacc = _mm_adds_epi16(_mm_packs_epi32(vacc_lo, vacc_hi), vy_zero_point);
-    __m128i vy = _mm_packus_epi16(vacc, vacc);
-    vy = _mm_max_epu8(vy, _mm_load_si128((const __m128i*) params->sse2.y_min));
-    vy = _mm_min_epu8(vy, _mm_load_si128((const __m128i*) params->sse2.y_max));
-
-    _mm_storel_epi64((__m128i*) y, vy);
-    y += 8;
-  }
-  if (n != 0) {
-    const __m128i va = _mm_loadl_epi64((const __m128i*) a);
-    const __m128i vb = _mm_loadl_epi64((const __m128i*) b);
-
-    const __m128i vxa = _mm_unpacklo_epi8(va, vzero);
-    const __m128i vxb = _mm_unpacklo_epi8(vb, vzero);
-
-    // Multiply by factors.
-    const __m128i va_product_lo = _mm_mullo_epi16(vxa, va_multiplier_lo);
-    const __m128i va_product_hi =
-      _mm_add_epi16(_mm_mulhi_epu16(vxa, va_multiplier_lo), _mm_mullo_epi16(vxa, va_multiplier_hi));
-
-    const __m128i vb_product_lo = _mm_mullo_epi16(vxb, vb_multiplier_lo);
-    const __m128i vb_product_hi =
-      _mm_add_epi16(_mm_mulhi_epu16(vxb, vb_multiplier_lo), _mm_mullo_epi16(vxb, vb_multiplier_hi));
-
-    // Accumulate products.
-    __m128i vacc_lo = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(va_product_lo, va_product_hi));
-    __m128i vacc_hi = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(va_product_lo, va_product_hi));
-
-    vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vb_product_lo, vb_product_hi));
-    vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vb_product_lo, vb_product_hi));
-
-    // Shift right and round.
-    const __m128i vrem_lo =
-      _mm_add_epi32(_mm_and_si128(vacc_lo, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo));
-    const __m128i vrem_hi =
-      _mm_add_epi32(_mm_and_si128(vacc_hi, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi));
-
-    vacc_lo = _mm_sub_epi32(_mm_sra_epi32(vacc_lo, vshift), _mm_cmpgt_epi32(vrem_lo, vremainder_threshold));
-    vacc_hi = _mm_sub_epi32(_mm_sra_epi32(vacc_hi, vshift), _mm_cmpgt_epi32(vrem_hi, vremainder_threshold));
-
-    // Pack, saturate, and add output zero point.
-    const __m128i vy_zero_point = _mm_load_si128((const __m128i*) params->sse2.y_zero_point);
-    const __m128i vacc = _mm_adds_epi16(_mm_packs_epi32(vacc_lo, vacc_hi), vy_zero_point);
-    __m128i vy = _mm_packus_epi16(vacc, vacc);
-    vy = _mm_max_epu8(vy, _mm_load_si128((const __m128i*) params->sse2.y_min));
-    vy = _mm_min_epu8(vy, _mm_load_si128((const __m128i*) params->sse2.y_max));
-
-    if (n & (4 * sizeof(uint8_t))) {
-      *((uint32_t*) y) = (uint32_t) _mm_cvtsi128_si32(vy);
-      vy = _mm_srli_epi64(vy, 32);
-      y += 4;
-    }
-    if (n & (2 * sizeof(uint8_t))) {
-      *((uint16_t*) y) = (uint16_t) _mm_extract_epi16(vy, 0);
-      vy = _mm_srli_epi32(vy, 16);
-      y += 2;
-    }
-    if (n & (1 * sizeof(uint8_t))) {
-      *((uint8_t*) y) = (uint8_t) _mm_cvtsi128_si32(vy);
-    }
-  }
-}