Evaluation stubs for RNDNU requantization

PiperOrigin-RevId: 376302078
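
RNDNU ("round to nearest, ties up") requantization multiplies the 32-bit accumulator by a fixed-point
multiplier derived from the float scale and arithmetically shifts the 64-bit product right with rounding,
breaking ties towards positive infinity. A minimal scalar sketch of the per-element computation, mirroring
the rndnu-scalar stub added below (requantize_rndnu_single is an illustrative name, fp32_to_bits comes from
<fp16/bitcasts.h>, and the right shift is assumed to behave as an arithmetic shift, which the stubs
guarantee via the asr_s64 helper):

  static inline int8_t requantize_rndnu_single(
      int32_t x, float scale, int8_t zero_point, int8_t qmin, int8_t qmax)
  {
    // Decompose scale into a 24-bit multiplier and a right shift: scale == multiplier * 2**(-shift).
    const uint32_t scale_bits = fp32_to_bits(scale);
    const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
    const uint32_t shift = 127 + 23 - (scale_bits >> 23);

    // Round to nearest with ties up: add 2**(shift-1) before the arithmetic shift.
    const int64_t rounding = INT64_C(1) << (shift - 1);
    const int32_t x_scaled = (int32_t) (((int64_t) x * (int64_t) multiplier + rounding) >> shift);

    // Clamp in the zero-point-shifted domain, then add the zero point back.
    const int32_t smin = (int32_t) qmin - (int32_t) zero_point;
    const int32_t smax = (int32_t) qmax - (int32_t) zero_point;
    const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
    return (int8_t) (x_clamped + zero_point);
  }
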
diff --git a/src/qs8-requantization/fp32-scalar-magic.c b/src/qs8-requantization/fp32-scalar-magic.c
index 44aea1a..4b9ec89 100644
--- a/src/qs8-requantization/fp32-scalar-magic.c
+++ b/src/qs8-requantization/fp32-scalar-magic.c
@@ -13,6 +13,7 @@
 
 #include <fp16/bitcasts.h>
 
+#include <xnnpack/math.h>
 #include <xnnpack/requantization-stubs.h>
 
 
@@ -45,10 +46,10 @@
     const float z_scaled = (float) z * scale;
     const float w_scaled = (float) w * scale;
 
-    const float x_clamped = x_scaled < fmin ? fmin : x_scaled > fmax ? fmax : x_scaled;
-    const float y_clamped = y_scaled < fmin ? fmin : y_scaled > fmax ? fmax : y_scaled;
-    const float z_clamped = z_scaled < fmin ? fmin : z_scaled > fmax ? fmax : z_scaled;
-    const float w_clamped = w_scaled < fmin ? fmin : w_scaled > fmax ? fmax : w_scaled;
+    const float x_clamped = math_min_f32(math_max_f32(x_scaled, fmin), fmax);
+    const float y_clamped = math_min_f32(math_max_f32(y_scaled, fmin), fmax);
+    const float z_clamped = math_min_f32(math_max_f32(z_scaled, fmin), fmax);
+    const float w_clamped = math_min_f32(math_max_f32(w_scaled, fmin), fmax);
 
     const int32_t x_biased = (int32_t) fp32_to_bits(x_clamped + fmagic) - imagic;
     const int32_t y_biased = (int32_t) fp32_to_bits(y_clamped + fmagic) - imagic;
diff --git a/src/qs8-requantization/gemmlowp-scalar.c b/src/qs8-requantization/gemmlowp-scalar.c
index b4f66f8..119e81e 100644
--- a/src/qs8-requantization/gemmlowp-scalar.c
+++ b/src/qs8-requantization/gemmlowp-scalar.c
@@ -106,10 +106,10 @@
     const int32_t w_scaled = asr_s32(w_q31product, shift) + (int32_t) (w_remainder > threshold);
 
     // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
-    const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
-    const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
-    const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
-    const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
+    const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax);
+    const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax);
+    const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax);
+    const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax);
 
     // Add zero point to clamped value.
     // The result is guaranteed to be in [qmin, qmax] range.
diff --git a/src/qs8-requantization/rndna-scalar-signed64.c b/src/qs8-requantization/rndna-scalar-signed64.c
index cf377a2..d04dc79 100644
--- a/src/qs8-requantization/rndna-scalar-signed64.c
+++ b/src/qs8-requantization/rndna-scalar-signed64.c
@@ -71,16 +71,16 @@
     const int32_t w_scaled = (int32_t) asr_s64(w_adjusted_product + rounding, shift);
 
     // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
-    const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
-    const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
-    const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
-    const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
+    const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax);
+    const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax);
+    const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax);
+    const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax);
 
     // Add zero point to clamped value.
     // The result is guaranteed to be in [qmin, qmax] range.
     //
     // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
-    // range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer.
+    // range, so addition of zero point (which can be up to 127) can overflow signed 32-bit integer.
     const int32_t x_biased = x_clamped + zero_point;
     const int32_t y_biased = y_clamped + zero_point;
     const int32_t z_biased = z_clamped + zero_point;
diff --git a/src/qs8-requantization/rndna-scalar-unsigned32.c b/src/qs8-requantization/rndna-scalar-unsigned32.c
index 675ed51..db66a3d 100644
--- a/src/qs8-requantization/rndna-scalar-unsigned32.c
+++ b/src/qs8-requantization/rndna-scalar-unsigned32.c
@@ -106,16 +106,16 @@
     const int32_t w_scaled = (int32_t) (w >= 0 ? w_abs_scaled : -w_abs_scaled);
 
     // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
-    const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
-    const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
-    const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
-    const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
+    const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax);
+    const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax);
+    const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax);
+    const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax);
 
     // Add zero point to clamped value.
     // The result is guaranteed to be in [qmin, qmax] range.
     //
     // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
-    // range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer.
+    // range, so addition of zero point (which can be up to 127) can overflow signed 32-bit integer.
     const int32_t x_biased = x_clamped + zero_point;
     const int32_t y_biased = y_clamped + zero_point;
     const int32_t z_biased = z_clamped + zero_point;
diff --git a/src/qs8-requantization/rndna-scalar-unsigned64.c b/src/qs8-requantization/rndna-scalar-unsigned64.c
index ddcb9a8..621fa5b 100644
--- a/src/qs8-requantization/rndna-scalar-unsigned64.c
+++ b/src/qs8-requantization/rndna-scalar-unsigned64.c
@@ -78,16 +78,16 @@
     const int32_t w_scaled = (int32_t) (w >= 0 ? w_abs_scaled : -w_abs_scaled);
 
     // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
-    const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
-    const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
-    const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
-    const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
+    const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax);
+    const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax);
+    const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax);
+    const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax);
 
     // Add zero point to clamped value.
     // The result is guaranteed to be in [qmin, qmax] range.
     //
     // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
-    // range, so addition of zero point (which can be up to 255) can overflow signed 32-bit integer.
+    // range, so addition of zero point (which can be up to 127) can overflow signed 32-bit integer.
     const int32_t x_biased = x_clamped + zero_point;
     const int32_t y_biased = y_clamped + zero_point;
     const int32_t z_biased = z_clamped + zero_point;
diff --git a/src/qs8-requantization/rndnu-neon.c b/src/qs8-requantization/rndnu-neon.c
new file mode 100644
index 0000000..96b0df9
--- /dev/null
+++ b/src/qs8-requantization/rndnu-neon.c
@@ -0,0 +1,133 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <arm_neon.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_rndnu__neon(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
+  const int32_t shift = 127 + 23 - (scale_bits >> 23);
+  assert(shift >= 24);
+  assert(shift < 56);
+
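+  // On AArch64, VMULL2/SMULL2 reads the upper half of a full 128-bit register, so the multiplier is
+  // duplicated across all four 32-bit lanes; on AArch32, VMULL.S32 takes 64-bit D-register operands,
+  // so a two-lane duplicate is sufficient.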
+#if defined(__aarch64__)
+  const int32x4_t vmultiplier = vdupq_n_s32(multiplier);
+#else
+  const int32x2_t vmultiplier = vdup_n_s32(multiplier);
+#endif
+  const int16x8_t vzero_point = vdupq_n_s16((int16_t) zero_point);
+  const int64x2_t vshift = vdupq_n_s64(-shift);
+  const int8x16_t vqmin = vdupq_n_s8(qmin);
+  const int8x16_t vqmax = vdupq_n_s8(qmax);
+  for (; n != 0; n -= 16) {
+    const int32x4_t x = vld1q_s32(input);
+    const int32x4_t y = vld1q_s32(input + 4);
+    const int32x4_t z = vld1q_s32(input + 8);
+    const int32x4_t w = vld1q_s32(input + 12);
+    input += 16;
+
+#if defined(__aarch64__)
+    const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vget_low_s32(vmultiplier));
+    const int64x2_t x23_product = vmull_high_s32(x, vmultiplier);
+    const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vget_low_s32(vmultiplier));
+    const int64x2_t y23_product = vmull_high_s32(y, vmultiplier);
+    const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vget_low_s32(vmultiplier));
+    const int64x2_t z23_product = vmull_high_s32(z, vmultiplier);
+    const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vget_low_s32(vmultiplier));
+    const int64x2_t w23_product = vmull_high_s32(w, vmultiplier);
+#else
+    const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vmultiplier);
+    const int64x2_t x23_product = vmull_s32(vget_high_s32(x), vmultiplier);
+    const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vmultiplier);
+    const int64x2_t y23_product = vmull_s32(vget_high_s32(y), vmultiplier);
+    const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vmultiplier);
+    const int64x2_t z23_product = vmull_s32(vget_high_s32(z), vmultiplier);
+    const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vmultiplier);
+    const int64x2_t w23_product = vmull_s32(vget_high_s32(w), vmultiplier);
+#endif
+
+    const int64x2_t x01_scaled = vrshlq_s64(x01_product, vshift);
+    const int64x2_t x23_scaled = vrshlq_s64(x23_product, vshift);
+    const int64x2_t y01_scaled = vrshlq_s64(y01_product, vshift);
+    const int64x2_t y23_scaled = vrshlq_s64(y23_product, vshift);
+    const int64x2_t z01_scaled = vrshlq_s64(z01_product, vshift);
+    const int64x2_t z23_scaled = vrshlq_s64(z23_product, vshift);
+    const int64x2_t w01_scaled = vrshlq_s64(w01_product, vshift);
+    const int64x2_t w23_scaled = vrshlq_s64(w23_product, vshift);
+
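+    // Narrow each pair of 64-bit scaled values to 32 bits by taking the low halves (UZP1 on AArch64,
+    // VMOVN on AArch32); the scaled values already fit into 32 bits, so no saturation is needed here.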
+#ifdef __aarch64__
+    const int32x4_t x_scaled = vuzp1q_s32(vreinterpretq_s32_s64(x01_scaled), vreinterpretq_s32_s64(x23_scaled));
+    const int32x4_t y_scaled = vuzp1q_s32(vreinterpretq_s32_s64(y01_scaled), vreinterpretq_s32_s64(y23_scaled));
+    const int32x4_t z_scaled = vuzp1q_s32(vreinterpretq_s32_s64(z01_scaled), vreinterpretq_s32_s64(z23_scaled));
+    const int32x4_t w_scaled = vuzp1q_s32(vreinterpretq_s32_s64(w01_scaled), vreinterpretq_s32_s64(w23_scaled));
+
+    const int16x8_t xy_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(x_scaled), y_scaled), vzero_point);
+    const int16x8_t zw_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(z_scaled), w_scaled), vzero_point);
+    const int8x16_t xyzw_packed = vqmovn_high_s16(vqmovn_s16(xy_packed), zw_packed);
+#else
+    const int32x4_t x_scaled = vcombine_s32(vmovn_s64(x01_scaled), vmovn_s64(x23_scaled));
+    const int32x4_t y_scaled = vcombine_s32(vmovn_s64(y01_scaled), vmovn_s64(y23_scaled));
+    const int32x4_t z_scaled = vcombine_s32(vmovn_s64(z01_scaled), vmovn_s64(z23_scaled));
+    const int32x4_t w_scaled = vcombine_s32(vmovn_s64(w01_scaled), vmovn_s64(w23_scaled));
+
+    const int16x8_t xy_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(x_scaled), vqmovn_s32(y_scaled)), vzero_point);
+    const int16x8_t zw_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(z_scaled), vqmovn_s32(w_scaled)), vzero_point);
+    const int8x16_t xyzw_packed = vcombine_s8(vqmovn_s16(xy_packed), vqmovn_s16(zw_packed));
+#endif
+
+    const int8x16_t xyzw_clamped = vmaxq_s8(vminq_s8(xyzw_packed, vqmax), vqmin);
+
+    // AArch32 version:
+    //   8x VMULL.S32 Qd, Dm, Dn
+    //   8x VRSHL.S64 Qd, Qm, Qn
+    //   8x VMOVN.S64 Dd, Qm
+    //   4x VQMOVN.S32 Dd, Qm
+    //   2x VQADD.S16 Qd, Qm, Qn
+    //   2x VQMOVN.S16 Dd, Qm
+    //   1x VMAX.S8 Qd, Qm, Qn
+    //   1x VMIN.S8 Qd, Qm, Qn
+    // ---------------------
+    // 34 instructions total
+    //
+    // AArch64 version:
+    //   4x SMULL Vd.2D, Vn.2S, Vm.2S
+    //   4x SMULL2 Vd.2D, Vn.4S, Vm.4S
+    //   8x SRSHL Vd.2D, Vn.2D, Vm.2D
+    //   4x UZP1 Vd.4S, Vn.4S, Vm.4S
+    //   2x SQXTN Vd.4H, Vn.4S
+    //   2x SQXTN2 Vd.8H, Vn.4S
+    //   2x SQADD Vd.8H, Vn.8H, Vm.8H
+    //   1x SQXTN Vd.8B, Vn.8H
+    //   1x SQXTN2 Vd.16B, Vn.8H
+    //   1x SMIN Vd.16B, Vn.16B, Vm.16B
+    //   1x SMAX Vd.16B, Vn.16B, Vm.16B
+    // ---------------------
+    // 30 instructions total
+
+    vst1q_s8(output, xyzw_clamped);
+    output += 16;
+  }
+}
diff --git a/src/qs8-requantization/rndnu-scalar.c b/src/qs8-requantization/rndnu-scalar.c
new file mode 100644
index 0000000..eafc7e7
--- /dev/null
+++ b/src/qs8-requantization/rndnu-scalar.c
@@ -0,0 +1,89 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/math.h>
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_rndnu__scalar(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 4 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
+  const uint32_t shift = 127 + 23 - (scale_bits >> 23);
+  assert(shift >= 24);
+  assert(shift < 56);
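+  // Example: scale = 0.125f has scale_bits = 0x3E000000 (biased exponent 124, zero mantissa),
+  // so multiplier = 0x00800000 = 8388608, shift = 127 + 23 - 124 = 26, and 8388608 * 2**-26 == 0.125.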
+
+  const int64_t rounding = INT64_C(1) << (shift - 1);
+  const int32_t smin = (int32_t) qmin - (int32_t) zero_point;
+  const int32_t smax = (int32_t) qmax - (int32_t) zero_point;
+  for (; n != 0; n -= 4) {
+    const int32_t x = input[0];
+    const int32_t y = input[1];
+    const int32_t z = input[2];
+    const int32_t w = input[3];
+    input += 4;
+
+    // Compute full 64-bit product of signed 32-bit factors.
+    //
+    // Note: multiplier can be treated as either signed or unsigned.
+    const int64_t x_product = (int64_t) x * (int64_t) multiplier;
+    const int64_t y_product = (int64_t) y * (int64_t) multiplier;
+    const int64_t z_product = (int64_t) z * (int64_t) multiplier;
+    const int64_t w_product = (int64_t) w * (int64_t) multiplier;
+
+    // Arithmetically shift the full 64-bit product right with rounding.
+    // Rounding is performed towards closest integer, with midpoints rounded up.
+    //
+    // Note that although the rounding constant is precomputed, it depends on the shift value, and on processors with
+    // a 64-bit "right shift with rounding" instruction each line below can be implemented with just one such
+    // instruction (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD).
+    const int32_t x_scaled = (int32_t) asr_s64(x_product + rounding, shift);
+    const int32_t y_scaled = (int32_t) asr_s64(y_product + rounding, shift);
+    const int32_t z_scaled = (int32_t) asr_s64(z_product + rounding, shift);
+    const int32_t w_scaled = (int32_t) asr_s64(w_product + rounding, shift);
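+    // Example (scale = 0.125, i.e. multiplier = 8388608, shift = 26, rounding = 2**25): for x = -100,
+    // x_product = -838860800 and x_scaled = (-838860800 + 33554432) >> 26 = -12, i.e. the exact
+    // value -12.5 is a midpoint and rounds up to -12.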
+
+    // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
+    const int32_t x_clamped = math_min_s32(math_max_s32(x_scaled, smin), smax);
+    const int32_t y_clamped = math_min_s32(math_max_s32(y_scaled, smin), smax);
+    const int32_t z_clamped = math_min_s32(math_max_s32(z_scaled, smin), smax);
+    const int32_t w_clamped = math_min_s32(math_max_s32(w_scaled, smin), smax);
+
+    // Add zero point to clamped value.
+    // The result is guaranteed to be in [qmin, qmax] range.
+    //
+    // This addition can not be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
+    // range, so addition of zero point (which can be up to 127) can overflow signed 32-bit integer.
+    const int32_t x_biased = x_clamped + zero_point;
+    const int32_t y_biased = y_clamped + zero_point;
+    const int32_t z_biased = z_clamped + zero_point;
+    const int32_t w_biased = w_clamped + zero_point;
+
+    output[0] = (int8_t) x_biased;
+    output[1] = (int8_t) y_biased;
+    output[2] = (int8_t) z_biased;
+    output[3] = (int8_t) w_biased;
+    output += 4;
+  }
+}
diff --git a/src/qs8-requantization/rndnu-sse4.c b/src/qs8-requantization/rndnu-sse4.c
new file mode 100644
index 0000000..9d08881
--- /dev/null
+++ b/src/qs8-requantization/rndnu-sse4.c
@@ -0,0 +1,108 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <smmintrin.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_rndnu__sse4(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const int32_t multiplier = ((int32_t) (scale_bits << 7) & INT32_C(0x3FFFFF80)) | INT32_C(0x40000000);
+  const uint32_t shift = 127 + 30 - (scale_bits >> 23);
+  assert(shift >= 31);
+  assert(shift < 63);
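+  // The multiplier is the mantissa of the scale left-aligned into 31 bits (bit 30 set), i.e. the
+  // scalar stub's 24-bit multiplier and shift both scaled by 2**7, so scale == multiplier * 2**(-shift).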
+  const int64_t rounding = INT64_C(1) << (shift - 1);
+
+  const __m128i vmultiplier = _mm_set1_epi32(multiplier);
+  const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+  const __m128i vqmin = _mm_set1_epi8((char) qmin);
+  const __m128i vqmax = _mm_set1_epi8((char) qmax);
+  const __m128i vshift = _mm_cvtsi32_si128((int) (shift - 31));
+  const __m128i vrounding = _mm_set1_epi64x(rounding);
+  for (; n != 0; n -= 16) {
+    const __m128i x = _mm_loadu_si128((const __m128i*) input);
+    const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+    const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+    const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+    input += 16;
+
+    const __m128i x_odd = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128i y_odd = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128i z_odd = _mm_shuffle_epi32(z, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128i w_odd = _mm_shuffle_epi32(w, _MM_SHUFFLE(3, 3, 1, 1));
+
+    const __m128i x_product02 = _mm_mul_epi32(x, vmultiplier);
+    const __m128i y_product02 = _mm_mul_epi32(y, vmultiplier);
+    const __m128i z_product02 = _mm_mul_epi32(z, vmultiplier);
+    const __m128i w_product02 = _mm_mul_epi32(w, vmultiplier);
+
+    const __m128i x_product13 = _mm_mul_epi32(x_odd, vmultiplier);
+    const __m128i y_product13 = _mm_mul_epi32(y_odd, vmultiplier);
+    const __m128i z_product13 = _mm_mul_epi32(z_odd, vmultiplier);
+    const __m128i w_product13 = _mm_mul_epi32(w_odd, vmultiplier);
+
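+    // Reposition each rounded 64-bit product so that its bits 31..62 land in a single 32-bit lane:
+    // products of even elements are shifted right by 31, products of odd elements are shifted left by 1
+    // (moving bits 31..62 into the upper dword), and PBLENDW below merges the two. A final PSRAD by
+    // (shift - 31) then completes the arithmetic shift by the full shift amount.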
+    const __m128i x_prescaled02 = _mm_srli_epi64(_mm_add_epi64(x_product02, vrounding), 31);
+    const __m128i x_prescaled13 = _mm_slli_epi64(_mm_add_epi64(x_product13, vrounding), 1);
+    const __m128i y_prescaled02 = _mm_srli_epi64(_mm_add_epi64(y_product02, vrounding), 31);
+    const __m128i y_prescaled13 = _mm_slli_epi64(_mm_add_epi64(y_product13, vrounding), 1);
+    const __m128i z_prescaled02 = _mm_srli_epi64(_mm_add_epi64(z_product02, vrounding), 31);
+    const __m128i z_prescaled13 = _mm_slli_epi64(_mm_add_epi64(z_product13, vrounding), 1);
+    const __m128i w_prescaled02 = _mm_srli_epi64(_mm_add_epi64(w_product02, vrounding), 31);
+    const __m128i w_prescaled13 = _mm_slli_epi64(_mm_add_epi64(w_product13, vrounding), 1);
+
+    const __m128i x_prescaled = _mm_blend_epi16(x_prescaled02, x_prescaled13, 0xCC);
+    const __m128i y_prescaled = _mm_blend_epi16(y_prescaled02, y_prescaled13, 0xCC);
+    const __m128i z_prescaled = _mm_blend_epi16(z_prescaled02, z_prescaled13, 0xCC);
+    const __m128i w_prescaled = _mm_blend_epi16(w_prescaled02, w_prescaled13, 0xCC);
+
+    const __m128i x_scaled = _mm_sra_epi32(x_prescaled, vshift);
+    const __m128i y_scaled = _mm_sra_epi32(y_prescaled, vshift);
+    const __m128i z_scaled = _mm_sra_epi32(z_prescaled, vshift);
+    const __m128i w_scaled = _mm_sra_epi32(w_prescaled, vshift);
+
+    const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point);
+    const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point);
+    const __m128i xyzw_packed = _mm_packs_epi16(xy_packed, zw_packed);
+    const __m128i xyzw_clamped = _mm_max_epi8(_mm_min_epi8(xyzw_packed, vqmax), vqmin);
+
+    // 4x PSHUFD
+    // 8x PMULDQ
+    // 8x PADDQ
+    // 4x PSRLQ
+    // 4x PSLLQ
+    // 4x PBLENDW
+    // 4x PSRAD
+    // 2x PACKSSDW
+    // 2x PADDSW
+    // 1x PACKSSWB
+    // 1x PMINSB
+    // 1x PMAXSB
+    // ---------------------
+    // 43 instructions total
+
+    _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+    output += 16;
+  }
+}