Signed requantization evaluation stubs and unit tests

PiperOrigin-RevId: 323887449
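
All of the new qs8 stubs share one calling convention (a minimal sketch; the
<variant> and <arch> placeholders stand for e.g. fp32/precise/q31 and
scalar/neon/sse2/...):

    void xnn_qs8_requantize_<variant>__<arch>(
        size_t n,              // number of elements, a multiple of 4 (scalar) or 16 (SIMD)
        const int32_t* input,  // int32_t accumulators to requantize
        float scale,           // requantization scale, in [0x1.0p-32f, 1.0f)
        int8_t zero_point,     // zero point of the output
        int8_t qmin,           // lower output clamping bound
        int8_t qmax,           // upper output clamping bound
        int8_t* output);       // requantized int8 outputs

The new test target can be run as `bazel test :qs8_requantization_test`.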
diff --git a/BUILD.bazel b/BUILD.bazel
index f98fb51..8ffbba8 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -358,6 +358,12 @@
     "src/math/sigmoid-scalar-lut2048-p1-div.c",
     "src/math/sigmoid-scalar-lut64-p2-div.c",
     "src/math/sigmoid-scalar-p5-div.c",
+    "src/qs8-requantization/fp32-scalar-lrintf.c",
+    "src/qs8-requantization/fp32-scalar-magic.c",
+    "src/qs8-requantization/precise-scalar-signed64.c",
+    "src/qs8-requantization/precise-scalar-unsigned32.c",
+    "src/qs8-requantization/precise-scalar-unsigned64.c",
+    "src/qs8-requantization/q31-scalar.c",
     "src/qu8-avgpool/9p8x-minmax-scalar-c1.c",
     "src/qu8-avgpool/9x-minmax-scalar-c1.c",
     "src/qu8-dwconv/up1x9-minmax-scalar.c",
@@ -842,6 +848,7 @@
     "src/f32-vunary/gen/vneg-wasmsimd-x8.c",
     "src/f32-vunary/gen/vsqr-wasmsimd-x4.c",
     "src/f32-vunary/gen/vsqr-wasmsimd-x8.c",
+    "src/qs8-requantization/fp32-wasmsimd.c",
     "src/qu8-requantization/fp32-wasmsimd.c",
     "src/x32-fill/wasmsimd.c",
     "src/x32-packx/x4-wasmsimd.c",
@@ -874,6 +881,8 @@
 ]
 
 PSIMD_ACCMATH_UKERNELS = [
+    "src/qs8-requantization/precise-psimd.c",
+    "src/qs8-requantization/fp32-psimd.c",
     "src/qu8-requantization/precise-psimd.c",
     "src/qu8-requantization/fp32-psimd.c",
 ]
@@ -1066,6 +1075,9 @@
     "src/f32-vunary/gen/vneg-neon-x8.c",
     "src/f32-vunary/gen/vsqr-neon-x4.c",
     "src/f32-vunary/gen/vsqr-neon-x8.c",
+    "src/qs8-requantization/precise-neon.c",
+    "src/qs8-requantization/fp32-neon.c",
+    "src/qs8-requantization/q31-neon.c",
     "src/qu8-avgpool/9p8x-minmax-neon-c8.c",
     "src/qu8-avgpool/9x-minmax-neon-c8.c",
     "src/qu8-dwconv/up8x9-minmax-neon.c",
@@ -1625,6 +1637,9 @@
     "src/f32-vrnd/gen/vrndu-sse2-x8.c",
     "src/f32-vrnd/gen/vrndd-sse2-x4.c",
     "src/f32-vrnd/gen/vrndd-sse2-x8.c",
+    "src/qs8-requantization/precise-sse2.c",
+    "src/qs8-requantization/fp32-sse2.c",
+    "src/qs8-requantization/q31-sse2.c",
     "src/qu8-avgpool/9p8x-minmax-sse2-c8.c",
     "src/qu8-avgpool/9x-minmax-sse2-c8.c",
     "src/qu8-igemm/4x4c2-minmax-sse2.c",
@@ -1659,6 +1674,8 @@
 ]
 
 SSSE3_UKERNELS = [
+    "src/qs8-requantization/precise-ssse3.c",
+    "src/qs8-requantization/q31-ssse3.c",
     "src/qu8-requantization/precise-ssse3.c",
     "src/qu8-requantization/q31-ssse3.c",
 ]
@@ -1682,6 +1699,9 @@
     "src/f32-vrnd/gen/vrndu-sse41-x8.c",
     "src/f32-vrnd/gen/vrndd-sse41-x4.c",
     "src/f32-vrnd/gen/vrndd-sse41-x8.c",
+    "src/qs8-requantization/fp32-sse4.c",
+    "src/qs8-requantization/precise-sse4.c",
+    "src/qs8-requantization/q31-sse4.c",
     "src/qu8-requantization/precise-sse4.c",
     "src/qu8-requantization/q31-sse4.c",
     "src/math/roundne-sse41.c",
@@ -4971,6 +4991,16 @@
 )
 
 xnnpack_unit_test(
+    name = "qs8_requantization_test",
+    srcs = [
+        "src/xnnpack/requantization-stubs.h",
+        "test/qs8-requantization.cc",
+        "test/requantization-tester.h",
+    ] + MICROKERNEL_TEST_HDRS,
+    deps = MICROKERNEL_TEST_DEPS,
+)
+
+xnnpack_unit_test(
     name = "qu8_avgpool_minmax_test",
     srcs = [
         "test/qu8-avgpool-minmax.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3bb9cef..53a4f9f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -481,6 +481,12 @@
   src/math/sigmoid-scalar-lut2048-p1-div.c
   src/math/sigmoid-scalar-lut64-p2-div.c
   src/math/sigmoid-scalar-p5-div.c
+  src/qs8-requantization/fp32-scalar-lrintf.c
+  src/qs8-requantization/fp32-scalar-magic.c
+  src/qs8-requantization/precise-scalar-signed64.c
+  src/qs8-requantization/precise-scalar-unsigned32.c
+  src/qs8-requantization/precise-scalar-unsigned64.c
+  src/qs8-requantization/q31-scalar.c
   src/qu8-avgpool/9p8x-minmax-scalar-c1.c
   src/qu8-avgpool/9x-minmax-scalar-c1.c
   src/qu8-dwconv/up1x9-minmax-scalar.c
@@ -528,6 +534,8 @@
   src/f32-spmm/gen/16x1-minmax-psimd.c)
 
 SET(XNNPACK_PSIMD_ACCMATH_MICROKERNEL_SRCS
+  src/qs8-requantization/fp32-psimd.c
+  src/qs8-requantization/precise-psimd.c
   src/qu8-requantization/fp32-psimd.c
   src/qu8-requantization/precise-psimd.c)
 
@@ -718,6 +726,9 @@
   src/f32-vunary/gen/vneg-neon-x8.c
   src/f32-vunary/gen/vsqr-neon-x4.c
   src/f32-vunary/gen/vsqr-neon-x8.c
+  src/qs8-requantization/precise-neon.c
+  src/qs8-requantization/fp32-neon.c
+  src/qs8-requantization/q31-neon.c
   src/qu8-avgpool/9p8x-minmax-neon-c8.c
   src/qu8-avgpool/9x-minmax-neon-c8.c
   src/qu8-dwconv/up8x9-minmax-neon.c
@@ -1271,6 +1282,9 @@
   src/f32-vrnd/gen/vrndu-sse2-x8.c
   src/f32-vrnd/gen/vrndd-sse2-x4.c
   src/f32-vrnd/gen/vrndd-sse2-x8.c
+  src/qs8-requantization/fp32-sse2.c
+  src/qs8-requantization/precise-sse2.c
+  src/qs8-requantization/q31-sse2.c
   src/qu8-avgpool/9p8x-minmax-sse2-c8.c
   src/qu8-avgpool/9x-minmax-sse2-c8.c
   src/qu8-igemm/4x4c2-minmax-sse2.c
@@ -1304,6 +1318,8 @@
   src/math/sigmoid-sse2-p5-div.c)
 
 SET(XNNPACK_SSSE3_MICROKERNEL_SRCS
+  src/qs8-requantization/precise-ssse3.c
+  src/qs8-requantization/q31-ssse3.c
   src/qu8-requantization/precise-ssse3.c
   src/qu8-requantization/q31-ssse3.c)
 
@@ -1326,6 +1342,9 @@
   src/f32-vrnd/gen/vrndu-sse41-x8.c
   src/f32-vrnd/gen/vrndd-sse41-x4.c
   src/f32-vrnd/gen/vrndd-sse41-x8.c
+  src/qs8-requantization/fp32-sse4.c
+  src/qs8-requantization/precise-sse4.c
+  src/qs8-requantization/q31-sse4.c
   src/qu8-requantization/precise-sse4.c
   src/qu8-requantization/q31-sse4.c
   src/math/roundne-sse41.c
diff --git a/src/qs8-requantization/fp32-neon.c b/src/qs8-requantization/fp32-neon.c
new file mode 100644
index 0000000..448cf25
--- /dev/null
+++ b/src/qs8-requantization/fp32-neon.c
@@ -0,0 +1,134 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_fp32__neon(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const float32x4_t vscale = vdupq_n_f32(scale);
+#ifdef __aarch64__
+  const int16x8_t vzero_point = vdupq_n_s16((int16_t) zero_point);
+  const int8x16_t vqmin = vdupq_n_s8(qmin);
+  const int8x16_t vqmax = vdupq_n_s8(qmax);
+#else
+  const float32x4_t vfmin = vdupq_n_f32((float) ((int32_t) qmin - (int32_t) zero_point));
+  const float32x4_t vfmax = vdupq_n_f32((float) ((int32_t) qmax - (int32_t) zero_point));
+  const float32x4_t vfmagic = vdupq_n_f32(12582912.0f);
+  const int32x4_t vimagic = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) zero_point);
+#endif
+  for (; n != 0; n -= 16) {
+    const int32x4_t x = vld1q_s32(input);
+    const int32x4_t y = vld1q_s32(input + 4);
+    const int32x4_t z = vld1q_s32(input + 8);
+    const int32x4_t w = vld1q_s32(input + 12);
+    input += 16;
+
+    // Convert int32_t input to FP32 and multiply by FP32 scale.
+    // Both operations involve statistically unbiased roundings:
+    // - Large int32_t values can't be exactly represented as FP32. The conversion instruction in ARM NEON
+    //   rounds them to the nearest FP32 value, with ties to even.
+    // - The product of two FP32 values is generally not exactly representable as an FP32 value, and will be
+    //   rounded to the nearest FP32 value, with ties to even.
+    const float32x4_t x_scaled = vmulq_f32(vcvtq_f32_s32(x), vscale);
+    const float32x4_t y_scaled = vmulq_f32(vcvtq_f32_s32(y), vscale);
+    const float32x4_t z_scaled = vmulq_f32(vcvtq_f32_s32(z), vscale);
+    const float32x4_t w_scaled = vmulq_f32(vcvtq_f32_s32(w), vscale);
+
+#ifdef __aarch64__
+    // Leverage the "Floating-point Convert to Signed integer, rounding to nearest with ties to even" instruction.
+    // This is an ARMv8 instruction (always available in AArch64), which saturates the result on overflow.
+    // We don't need to specifically consider saturated results, as they will be clamped at the last stage.
+    const int32x4_t x_rounded = vcvtnq_s32_f32(x_scaled);
+    const int32x4_t y_rounded = vcvtnq_s32_f32(y_scaled);
+    const int32x4_t z_rounded = vcvtnq_s32_f32(z_scaled);
+    const int32x4_t w_rounded = vcvtnq_s32_f32(w_scaled);
+
+    // Standard final sequence on ARM NEON:
+    // - Pack to int16_t and saturate
+    // - Add zero point
+    // - Pack to uint8_t and saturate
+    // - Clamp between qmin and qmax
+    const int16x8_t xy_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(x_rounded), y_rounded), vzero_point);
+    const int16x8_t zw_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(z_rounded), w_rounded), vzero_point);
+    const int8x16_t xyzw_packed = vqmovn_high_s16(vqmovn_s16(xy_packed), zw_packed);
+    const int8x16_t xyzw_clamped = vmaxq_s8(vminq_s8(xyzw_packed, vqmax), vqmin);
+
+    // AArch64 version:
+    //   4x SCVTF Vd.4S, Vn.4S
+    //   4x FMUL Vd.4S, Vn.4S, Vm.4S
+    //   4x FCVTNS Vd.4S, Vn.4S
+    //   2x SQXTN Vd.4H, Vn.4S
+    //   2x SQXTN2 Vd.8H, Vn.4S
+    //   2x SQADD Vd.8H, Vn.8H, Vm.8H
+    //   1x SQXTN Vd.8B, Vn.8H
+    //   1x SQXTN2 Vd.16B, Vn.8H
+    //   1x SMIN Vd.16B, Vn.16B, Vm.16B
+    //   1x SMAX Vd.16B, Vn.16B, Vm.16B
+    // ---------------------
+    // 22 instructions total
+    vst1q_s8(output, xyzw_clamped); output += 16;
+#else
+    // AArch32 version:
+    //   4x VCVT.F32.S32 Qd, Qm
+    //   4x VMUL.F32 Qd, Qm, Qn
+    //   4x VMIN.F32 Qd, Qm, Qn
+    //   4x VMAX.F32 Qd, Qm, Qn
+    //   4x VADD.F32 Qd, Qm, Qn
+    //   4x VSUB.S32 Qd, Qm, Qn
+    //   4x VMOVN.I32 Dd, Qm
+    //   2x VMOVN.I16 Dd, Qm
+    // ---------------------
+    // 30 instructions total
+
+    // ARMv7 NEON offers only a floating-point to integer conversion instruction with rounding towards zero.
+    // In lieu of conversion instruction with rounding-to-nearest-even, we use a magic trick of adding a large
+    // number (1.5 * 2**23) to scaled value to cause rounding to integer, and then subtracting this magic number as
+    // integer. This trick works only in a limited range (absolute value of input must be less than 2**22), so
+    // generally we have to clamp input to this range before using the magic. However, clamping to any smaller range
+    // works just as well, and thus we clamp to [qmin - zero point, qmax - zero point] range so that after we add
+    // zero point to the result, it gets into target [qmin, qmax] range.
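+    // Worked example of the trick, assuming zero_point == 0 (so vimagic == 0x4B400000): for a scaled value of
+    // 0.5f, 0.5f + 12582912.0f rounds to 12582912.0f (tie to even), whose bit pattern is 0x4B400000, so the
+    // integer subtraction yields 0; for 1.5f the sum rounds to 12582914.0f (bits 0x4B400002), yielding 2.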
+    const float32x4_t x_clamped = vminq_f32(vmaxq_f32(x_scaled, vfmin), vfmax);
+    const float32x4_t y_clamped = vminq_f32(vmaxq_f32(y_scaled, vfmin), vfmax);
+    const float32x4_t z_clamped = vminq_f32(vmaxq_f32(z_scaled, vfmin), vfmax);
+    const float32x4_t w_clamped = vminq_f32(vmaxq_f32(w_scaled, vfmin), vfmax);
+
+    // Conversion to integer using the "magic trick". Rounding is performed as part of the addition, and the
+    // result is rounded to the nearest integer, with ties to even.
+    const int32x4_t x_biased = vsubq_s32(vreinterpretq_s32_f32(vaddq_f32(x_clamped, vfmagic)), vimagic);
+    const int32x4_t y_biased = vsubq_s32(vreinterpretq_s32_f32(vaddq_f32(y_clamped, vfmagic)), vimagic);
+    const int32x4_t z_biased = vsubq_s32(vreinterpretq_s32_f32(vaddq_f32(z_clamped, vfmagic)), vimagic);
+    const int32x4_t w_biased = vsubq_s32(vreinterpretq_s32_f32(vaddq_f32(w_clamped, vfmagic)), vimagic);
+
+    // Select low 8 bits of each 32-bit integer in the vectors for the output.
+    // Since the result is already clamped to the [qmin, qmax] subrange of [-128, 127], saturation is not needed.
+    const int16x8_t xy_packed = vcombine_s16(vmovn_s32(x_biased), vmovn_s32(y_biased));
+    const int16x8_t zw_packed = vcombine_s16(vmovn_s32(z_biased), vmovn_s32(w_biased));
+    const int8x16_t xyzw_packed = vcombine_s8(vmovn_s16(xy_packed), vmovn_s16(zw_packed));
+
+    vst1q_s8(output, xyzw_packed); output += 16;
+#endif
+  }
+}
diff --git a/src/qs8-requantization/fp32-psimd.c b/src/qs8-requantization/fp32-psimd.c
new file mode 100644
index 0000000..21958a1
--- /dev/null
+++ b/src/qs8-requantization/fp32-psimd.c
@@ -0,0 +1,84 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <psimd.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_fp32__psimd(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const psimd_f32 vscale = psimd_splat_f32(scale);
+  const psimd_f32 vfmin = psimd_splat_f32((float) ((int32_t) qmin - (int32_t) zero_point));
+  const psimd_f32 vfmax = psimd_splat_f32((float) ((int32_t) qmax - (int32_t) zero_point));
+  const psimd_f32 vfmagic = psimd_splat_f32(12582912.0f);
+  const psimd_s32 vimagic = psimd_splat_s32(INT32_C(0x4B400000) - (int32_t) zero_point);
+  for (; n != 0; n -= 16) {
+    const psimd_s32 x = psimd_load_s32(input);
+    const psimd_s32 y = psimd_load_s32(input + 4);
+    const psimd_s32 z = psimd_load_s32(input + 8);
+    const psimd_s32 w = psimd_load_s32(input + 12);
+    input += 16;
+
+    // Convert int32_t input to FP32 and multiply by FP32 scale.
+    // Both operations involve roundings:
+    // - Large int32_t values can't be exactly represented as FP32. We expect the conversion instruction to
+    //   round them to the nearest FP32 value with ties to even, but the Clang documentation for
+    //   __builtin_convertvector does not guarantee that.
+    // - The product of two FP32 values is generally not exactly representable as an FP32 value, and will be
+    //   rounded to the nearest FP32 value, with ties to even.
+    const psimd_f32 x_scaled = psimd_cvt_s32_f32(x) * vscale;
+    const psimd_f32 y_scaled = psimd_cvt_s32_f32(y) * vscale;
+    const psimd_f32 z_scaled = psimd_cvt_s32_f32(z) * vscale;
+    const psimd_f32 w_scaled = psimd_cvt_s32_f32(w) * vscale;
+
+    // The Clang/gcc vector extension does not provide an intrinsic for a floating-point to integer conversion
+    // operation with rounding-to-nearest-even. In lieu of such an intrinsic, we use a magic trick of adding a large
+    // number (1.5 * 2**23) to the scaled value to cause rounding to integer, and then subtracting this magic number as
+    // integer. This trick works only in a limited range (absolute value of input must be less than 2**22), so
+    // generally we have to clamp input to this range before using the magic. However, clamping to any smaller range
+    // works just as well, and thus we clamp to [qmin - zero point, qmax - zero point] range so that after we add
+    // zero point to the result, it gets into target [qmin, qmax] range.
+    const psimd_f32 x_clamped = psimd_min_f32(psimd_max_f32(x_scaled, vfmin), vfmax);
+    const psimd_f32 y_clamped = psimd_min_f32(psimd_max_f32(y_scaled, vfmin), vfmax);
+    const psimd_f32 z_clamped = psimd_min_f32(psimd_max_f32(z_scaled, vfmin), vfmax);
+    const psimd_f32 w_clamped = psimd_min_f32(psimd_max_f32(w_scaled, vfmin), vfmax);
+
+    // Conversion to integer using the "magic trick". Rounding is performed as part of the addition, and the
+    // result is rounded to the nearest integer, with ties to even.
+    const psimd_s32 x_biased = (psimd_s32) (x_clamped + vfmagic) - vimagic;
+    const psimd_s32 y_biased = (psimd_s32) (y_clamped + vfmagic) - vimagic;
+    const psimd_s32 z_biased = (psimd_s32) (z_clamped + vfmagic) - vimagic;
+    const psimd_s32 w_biased = (psimd_s32) (w_clamped + vfmagic) - vimagic;
+
+    // Select low 8 bits of each 32-bit integer in the vectors for the output.
+    // Since the result is already clamped to the [qmin, qmax] subrange of [-128, 127], saturation is not needed.
+    const psimd_s16 xy_packed = psimd_concat_even_s16((psimd_s16) x_biased, (psimd_s16) y_biased);
+    const psimd_s16 zw_packed = psimd_concat_even_s16((psimd_s16) z_biased, (psimd_s16) w_biased);
+
+    const psimd_s8 xyzw_packed = psimd_concat_even_s8((psimd_s8) xy_packed, (psimd_s8) zw_packed);
+
+    psimd_store_s8(output, xyzw_packed);
+    output += 16;
+  }
+}
diff --git a/src/qs8-requantization/fp32-scalar-lrintf.c b/src/qs8-requantization/fp32-scalar-lrintf.c
new file mode 100644
index 0000000..29937fe
--- /dev/null
+++ b/src/qs8-requantization/fp32-scalar-lrintf.c
@@ -0,0 +1,67 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_fp32__scalar_lrintf(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 4 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const long lmin = (long) ((int32_t) qmin - (int32_t) zero_point);
+  const long lmax = (long) ((int32_t) qmax - (int32_t) zero_point);
+  for (; n != 0; n -= 4) {
+    const int32_t x = input[0];
+    const int32_t y = input[1];
+    const int32_t z = input[2];
+    const int32_t w = input[3];
+    input += 4;
+
+    const float x_scaled = (float) x * scale;
+    const float y_scaled = (float) y * scale;
+    const float z_scaled = (float) z * scale;
+    const float w_scaled = (float) w * scale;
+
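+    // lrintf() rounds according to the current floating-point rounding mode, which by default is
+    // round-to-nearest with ties to even.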
+    const long x_rounded = lrintf(x_scaled);
+    const long y_rounded = lrintf(y_scaled);
+    const long z_rounded = lrintf(z_scaled);
+    const long w_rounded = lrintf(w_scaled);
+
+    const int32_t x_clamped = (int32_t) (x_rounded < lmin ? lmin : x_rounded > lmax ? lmax : x_rounded);
+    const int32_t y_clamped = (int32_t) (y_rounded < lmin ? lmin : y_rounded > lmax ? lmax : y_rounded);
+    const int32_t z_clamped = (int32_t) (z_rounded < lmin ? lmin : z_rounded > lmax ? lmax : z_rounded);
+    const int32_t w_clamped = (int32_t) (w_rounded < lmin ? lmin : w_rounded > lmax ? lmax : w_rounded);
+
+    const int32_t x_biased = x_clamped + (int32_t) zero_point;
+    const int32_t y_biased = y_clamped + (int32_t) zero_point;
+    const int32_t z_biased = z_clamped + (int32_t) zero_point;
+    const int32_t w_biased = w_clamped + (int32_t) zero_point;
+
+    output[0] = (int8_t) x_biased;
+    output[1] = (int8_t) y_biased;
+    output[2] = (int8_t) z_biased;
+    output[3] = (int8_t) w_biased;
+    output += 4;
+  }
+}
diff --git a/src/qs8-requantization/fp32-scalar-magic.c b/src/qs8-requantization/fp32-scalar-magic.c
new file mode 100644
index 0000000..44aea1a
--- /dev/null
+++ b/src/qs8-requantization/fp32-scalar-magic.c
@@ -0,0 +1,64 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_fp32__scalar_magic(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 4 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const float fmin = (float) ((int32_t) qmin - (int32_t) zero_point);
+  const float fmax = (float) ((int32_t) qmax - (int32_t) zero_point);
+  const float fmagic = 12582912.0f;
+  const int32_t imagic = INT32_C(0x4B400000) - (int32_t) zero_point;
+  for (; n != 0; n -= 4) {
+    const int32_t x = input[0];
+    const int32_t y = input[1];
+    const int32_t z = input[2];
+    const int32_t w = input[3];
+    input += 4;
+
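+    // Scale the int32_t accumulators in FP32 arithmetic.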
+    const float x_scaled = (float) x * scale;
+    const float y_scaled = (float) y * scale;
+    const float z_scaled = (float) z * scale;
+    const float w_scaled = (float) w * scale;
+
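+    // Clamp to [qmin - zero point, qmax - zero point] before the magic addition: the magic trick is only valid
+    // for inputs of magnitude below 2**22, and pre-clamping also makes output saturation unnecessary.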
+    const float x_clamped = x_scaled < fmin ? fmin : x_scaled > fmax ? fmax : x_scaled;
+    const float y_clamped = y_scaled < fmin ? fmin : y_scaled > fmax ? fmax : y_scaled;
+    const float z_clamped = z_scaled < fmin ? fmin : z_scaled > fmax ? fmax : z_scaled;
+    const float w_clamped = w_scaled < fmin ? fmin : w_scaled > fmax ? fmax : w_scaled;
+
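+    // Add the magic number 1.5 * 2**23 to round to the nearest integer (ties to even), reinterpret the FP32
+    // bits as int32_t, and subtract the magic bias (with the zero point already folded into imagic).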
+    const int32_t x_biased = (int32_t) fp32_to_bits(x_clamped + fmagic) - imagic;
+    const int32_t y_biased = (int32_t) fp32_to_bits(y_clamped + fmagic) - imagic;
+    const int32_t z_biased = (int32_t) fp32_to_bits(z_clamped + fmagic) - imagic;
+    const int32_t w_biased = (int32_t) fp32_to_bits(w_clamped + fmagic) - imagic;
+
+    output[0] = (int8_t) x_biased;
+    output[1] = (int8_t) y_biased;
+    output[2] = (int8_t) z_biased;
+    output[3] = (int8_t) w_biased;
+    output += 4;
+  }
+}
diff --git a/src/qs8-requantization/fp32-sse2.c b/src/qs8-requantization/fp32-sse2.c
new file mode 100644
index 0000000..b7518c9
--- /dev/null
+++ b/src/qs8-requantization/fp32-sse2.c
@@ -0,0 +1,94 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_fp32__sse2(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const __m128 vscale = _mm_set1_ps(scale);
+  const __m128i vzero_point = _mm_set1_epi16((short) (uint16_t) zero_point);
+  const __m128i vqmin = _mm_set1_epi8((char) qmin);
+  const __m128i vqmax = _mm_set1_epi8((char) qmax);
+  for (; n != 0; n -= 16) {
+    const __m128i x = _mm_loadu_si128((const __m128i*) input);
+    const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+    const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+    const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+    input += 16;
+
+    // Convert int32_t input to FP32 and multiply by FP32 scale.
+    // Both operations involve statistically unbiased roundings (with default MXCSR rounding mode):
+    // - Large int32_t values can't be exactly represented as FP32. The CVTDQ2PS instruction on x86 rounds them
+    //   to the nearest FP32 value, with ties to even (assuming the default MXCSR rounding mode).
+    // - The product of two FP32 values is generally not exactly representable as an FP32 value, and will be
+    //   rounded to the nearest FP32 value, with ties to even, under the default MXCSR rounding mode.
+    const __m128 x_scaled = _mm_mul_ps(_mm_cvtepi32_ps(x), vscale);
+    const __m128 y_scaled = _mm_mul_ps(_mm_cvtepi32_ps(y), vscale);
+    const __m128 z_scaled = _mm_mul_ps(_mm_cvtepi32_ps(z), vscale);
+    const __m128 w_scaled = _mm_mul_ps(_mm_cvtepi32_ps(w), vscale);
+
+    // Convert the scaled FP32 result to int32_t using the CVTPS2DQ instruction from x86 SSE2. CVTPS2DQ rounds
+    // the result to the nearest integer, with ties to even (assuming the default MXCSR rounding mode).
+    // However, when conversion overflows, it produces INT32_MIN as a result. For large positive inputs the result
+    // of conversion can become negative, which affects the final requantization result. Note that on x86 SSE2 we
+    // have e.g. int32_t(float(INT32_MAX)) == INT32_MIN! This happens because float(INT32_MAX) rounds to 2**31,
+    // which overflows int32_t when it is converted back to integer.
+    //
+    // Thankfully, we can prove that overflow never happens in this requantization scheme. The largest positive
+    // input is INT32_MAX (2**31 - 1), which turns into 2**31 when converted to float. The largest scale value
+    // is 0x1.FFFFFEp-1. When multiplied together, the result is 2147483520 (compare to INT32_MAX = 2147483647),
+    // which fits into int32_t without overflow.
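+    // (Indeed, 0x1.FFFFFEp-1 * 2**31 == (1 - 2**-24) * 2**31 == 2**31 - 2**7 == 2147483520.)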
+    const __m128i x_rounded = _mm_cvtps_epi32(x_scaled);
+    const __m128i y_rounded = _mm_cvtps_epi32(y_scaled);
+    const __m128i z_rounded = _mm_cvtps_epi32(z_scaled);
+    const __m128i w_rounded = _mm_cvtps_epi32(w_scaled);
+
+    // Standard final sequence on x86 SSE2:
+    // - Pack to int16_t and saturate
+    // - Add zero point
+    // - Clamp between qmin and qmax
+    // - Pack to int8_t and saturate
+    const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_rounded, y_rounded), vzero_point);
+    const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_rounded, w_rounded), vzero_point);
+    const __m128i xy_clamped = _mm_max_epi16(_mm_min_epi16(xy_packed, vqmax), vqmin);
+    const __m128i zw_clamped = _mm_max_epi16(_mm_min_epi16(zw_packed, vqmax), vqmin);
+    const __m128i xyzw_clamped = _mm_packs_epi16(xy_clamped, zw_clamped);
+
+    // 4x CVTDQ2PS
+    // 4x MULPS
+    // 4x CVTPS2DQ
+    // 2x PACKSSDW
+    // 2x PADDSW
+    // 2x PMAXSW
+    // 2x PMINSW
+    // 1x PACKSSWB
+    // ---------------------
+    // 21 instructions total
+
+    _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+    output += 16;
+  }
+}
diff --git a/src/qs8-requantization/fp32-sse4.c b/src/qs8-requantization/fp32-sse4.c
new file mode 100644
index 0000000..1df6463
--- /dev/null
+++ b/src/qs8-requantization/fp32-sse4.c
@@ -0,0 +1,93 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <nmmintrin.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_fp32__sse4(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const __m128 vscale = _mm_set1_ps(scale);
+  const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+  const __m128i vqmin = _mm_set1_epi8((char) qmin);
+  const __m128i vqmax = _mm_set1_epi8((char) qmax);
+  for (; n != 0; n -= 16) {
+    const __m128i x = _mm_loadu_si128((const __m128i*) input);
+    const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+    const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+    const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+    input += 16;
+
+    // Convert int32_t input to FP32 and multiply by FP32 scale.
+    // Both operations involve statistically unbiased roundings (with default MXCSR rounding mode):
+    // - Large int32_t values can't be exactly represented as FP32. The CVTDQ2PS instruction on x86 rounds them
+    //   to the nearest FP32 value, with ties to even (assuming the default MXCSR rounding mode).
+    // - The product of two FP32 values is generally not exactly representable as an FP32 value, and will be
+    //   rounded to the nearest FP32 value, with ties to even, under the default MXCSR rounding mode.
+    const __m128 x_scaled = _mm_mul_ps(_mm_cvtepi32_ps(x), vscale);
+    const __m128 y_scaled = _mm_mul_ps(_mm_cvtepi32_ps(y), vscale);
+    const __m128 z_scaled = _mm_mul_ps(_mm_cvtepi32_ps(z), vscale);
+    const __m128 w_scaled = _mm_mul_ps(_mm_cvtepi32_ps(w), vscale);
+
+    // Convert the scaled FP32 result to int32_t using the CVTPS2DQ instruction from x86 SSE2. CVTPS2DQ rounds
+    // the result to the nearest integer, with ties to even (assuming the default MXCSR rounding mode).
+    // However, when conversion overflows, it produces INT32_MIN as a result. For large positive inputs the result
+    // of conversion can become negative, which affects the final requantization result. Note that on x86 SSE2 we
+    // have e.g. int32_t(float(INT32_MAX)) == INT32_MIN! This happens because float(INT32_MAX) rounds to 2**31,
+    // which overflows int32_t when it is converted back to integer.
+    //
+    // Thankfully, we can prove that overflow never happens in this requantization scheme. The largest positive
+    // input is INT32_MAX (2**31 - 1), which turns into 2**31 when converted to float. The largest scale value
+    // is 0x1.FFFFFEp-1. When multiplied together, the result is 2147483520 (compare to INT32_MAX = 2147483647),
+    // which fits into int32_t without overflow.
+    const __m128i x_rounded = _mm_cvtps_epi32(x_scaled);
+    const __m128i y_rounded = _mm_cvtps_epi32(y_scaled);
+    const __m128i z_rounded = _mm_cvtps_epi32(z_scaled);
+    const __m128i w_rounded = _mm_cvtps_epi32(w_scaled);
+
+    // Standard final sequence on x86 SSE4:
+    // - Pack to int16_t and saturate
+    // - Add zero point
+    // - Pack to int8_t and saturate
+    // - Clamp between qmin and qmax
+    const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_rounded, y_rounded), vzero_point);
+    const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_rounded, w_rounded), vzero_point);
+    const __m128i xyzw_packed = _mm_packs_epi16(xy_packed, zw_packed);
+    const __m128i xyzw_clamped = _mm_max_epi8(_mm_min_epi8(xyzw_packed, vqmax), vqmin);
+
+    // 4x CVTDQ2PS
+    // 4x MULPS
+    // 4x CVTPS2DQ
+    // 2x PACKSSDW
+    // 2x PADDSW
+    // 1x PACKSSWB
+    // 1x PMAXSB
+    // 1x PMINSB
+    // ---------------------
+    // 19 instructions total
+
+    _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+    output += 16;
+  }
+}
diff --git a/src/qs8-requantization/fp32-wasmsimd.c b/src/qs8-requantization/fp32-wasmsimd.c
new file mode 100644
index 0000000..b7458f8
--- /dev/null
+++ b/src/qs8-requantization/fp32-wasmsimd.c
@@ -0,0 +1,89 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_fp32__wasmsimd(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const v128_t vscale = wasm_f32x4_splat(scale);
+  const v128_t vfmin = wasm_f32x4_splat((float) ((int32_t) qmin - (int32_t) zero_point));
+  const v128_t vfmax = wasm_f32x4_splat((float) ((int32_t) qmax - (int32_t) zero_point));
+  const v128_t vfmagic = wasm_f32x4_splat(12582912.0f);
+  const v128_t vimagic = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) zero_point);
+  for (; n != 0; n -= 16) {
+    const v128_t x = wasm_v128_load(input);
+    const v128_t y = wasm_v128_load(input + 4);
+    const v128_t z = wasm_v128_load(input + 8);
+    const v128_t w = wasm_v128_load(input + 12);
+    input += 16;
+
+    // Convert int32_t input to FP32 and multiply by FP32 scale.
+    // Both operations involve statistically unbiased roundings:
+    // - Large int32_t values can't be exactly represented as FP32. The conversion instruction in WAsm SIMD
+    //   rounds them to the nearest FP32 value, with ties to even.
+    // - The product of two FP32 values is generally not exactly representable as an FP32 value, and will be
+    //   rounded to the nearest FP32 value, with ties to even.
+    const v128_t x_scaled = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(x), vscale);
+    const v128_t y_scaled = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(y), vscale);
+    const v128_t z_scaled = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(z), vscale);
+    const v128_t w_scaled = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(w), vscale);
+
+    // WAsm SIMD offers only a floating-point to integer conversion instruction with rounding towards zero.
+    // In lieu of conversion instruction with rounding-to-nearest-even, we use a magic trick of adding a large
+    // number (1.5 * 2**23) to scaled value to cause rounding to integer, and then subtracting this magic number as
+    // integer. This trick works only in a limited range (absolute value of input must be less than 2**22), so
+    // generally we have to clamp input to this range before using the magic. However, clamping to any smaller range
+    // works just as well, and thus we clamp to [qmin - zero point, qmax - zero point] range so that after we add
+    // zero point to the result, it gets into target [qmin, qmax] range.
+    const v128_t x_clamped = wasm_f32x4_min(wasm_f32x4_max(x_scaled, vfmin), vfmax);
+    const v128_t y_clamped = wasm_f32x4_min(wasm_f32x4_max(y_scaled, vfmin), vfmax);
+    const v128_t z_clamped = wasm_f32x4_min(wasm_f32x4_max(z_scaled, vfmin), vfmax);
+    const v128_t w_clamped = wasm_f32x4_min(wasm_f32x4_max(w_scaled, vfmin), vfmax);
+
+    // Conversion to integer using the "magic trick". Rounding is performed as part of the addition, and the
+    // result is rounded to the nearest integer, with ties to even.
+    const v128_t x_biased = wasm_i32x4_sub(wasm_f32x4_add(x_clamped, vfmagic), vimagic);
+    const v128_t y_biased = wasm_i32x4_sub(wasm_f32x4_add(y_clamped, vfmagic), vimagic);
+    const v128_t z_biased = wasm_i32x4_sub(wasm_f32x4_add(z_clamped, vfmagic), vimagic);
+    const v128_t w_biased = wasm_i32x4_sub(wasm_f32x4_add(w_clamped, vfmagic), vimagic);
+
+    // Select low 8 bits of each 32-bit integer in the vectors for the output.
+    // Since the result is already clamped to the [qmin, qmax] subrange of [-128, 127], saturation is not needed.
+    const v128_t xy_packed = wasm_v16x8_shuffle(x_biased, y_biased, 0, 2, 4, 6, 8, 10, 12, 14);
+    const v128_t zw_packed = wasm_v16x8_shuffle(z_biased, w_biased, 0, 2, 4, 6, 8, 10, 12, 14);
+    const v128_t xyzw_packed = wasm_v8x16_shuffle(xy_packed, zw_packed, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+
+    // 4x f32x4.convert_i32x4_s
+    // 4x f32x4.mul
+    // 4x f32x4.max
+    // 4x f32x4.min
+    // 4x f32x4.add
+    // 4x i32x4.sub
+    // 3x v8x16.shuffle
+    // ---------------------
+    // 27 instructions total
+
+    wasm_v128_store(output, xyzw_packed);
+    output += 16;
+  }
+}
diff --git a/src/qs8-requantization/precise-neon.c b/src/qs8-requantization/precise-neon.c
new file mode 100644
index 0000000..b95c6de
--- /dev/null
+++ b/src/qs8-requantization/precise-neon.c
@@ -0,0 +1,166 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <arm_neon.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_precise__neon(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
+  const int32_t shift = 127 + 23 - (scale_bits >> 23);
+  assert(shift >= 24);
+  assert(shift < 56);
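+  // E.g. scale = 0.5f (bits 0x3F000000) yields multiplier = 0x00800000 (2**23) and shift = 127 + 23 - 126 = 24,
+  // so that multiplier * 2**-shift == 0.5 exactly.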
+
+#if defined(__aarch64__)
+  const int32x4_t vmultiplier = vdupq_n_s32(multiplier);
+#else
+  const int32x2_t vmultiplier = vdup_n_s32(multiplier);
+#endif
+  const int16x8_t vzero_point = vdupq_n_s16((int16_t) zero_point);
+  const int64x2_t vshift = vdupq_n_s64(-shift);
+  const int8x16_t vqmin = vdupq_n_s8(qmin);
+  const int8x16_t vqmax = vdupq_n_s8(qmax);
+  for (; n != 0; n -= 16) {
+    const int32x4_t x = vld1q_s32(input);
+    const int32x4_t y = vld1q_s32(input + 4);
+    const int32x4_t z = vld1q_s32(input + 8);
+    const int32x4_t w = vld1q_s32(input + 12);
+    input += 16;
+
+    const uint32x4_t x_neg_mask = vcltq_s32(x, vmovq_n_s32(0));
+    const uint32x4_t y_neg_mask = vcltq_s32(y, vmovq_n_s32(0));
+    const uint32x4_t z_neg_mask = vcltq_s32(z, vmovq_n_s32(0));
+    const uint32x4_t w_neg_mask = vcltq_s32(w, vmovq_n_s32(0));
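+
+    // The rounding shift (vrshlq_s64 below) rounds midpoints up; adding these all-ones masks (-1 for negative
+    // lanes) to the products beforehand turns that into rounding away from zero, matching the scalar variants.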
+
+#if defined(__aarch64__)
+    const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vget_low_s32(vmultiplier));
+    const int64x2_t x23_product = vmull_high_s32(x, vmultiplier);
+    const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vget_low_s32(vmultiplier));
+    const int64x2_t y23_product = vmull_high_s32(y, vmultiplier);
+    const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vget_low_s32(vmultiplier));
+    const int64x2_t z23_product = vmull_high_s32(z, vmultiplier);
+    const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vget_low_s32(vmultiplier));
+    const int64x2_t w23_product = vmull_high_s32(w, vmultiplier);
+#else
+    const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vmultiplier);
+    const int64x2_t x23_product = vmull_s32(vget_high_s32(x), vmultiplier);
+    const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vmultiplier);
+    const int64x2_t y23_product = vmull_s32(vget_high_s32(y), vmultiplier);
+    const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vmultiplier);
+    const int64x2_t z23_product = vmull_s32(vget_high_s32(z), vmultiplier);
+    const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vmultiplier);
+    const int64x2_t w23_product = vmull_s32(vget_high_s32(w), vmultiplier);
+#endif
+
+#if defined(__aarch64__)
+    const int64x2_t x01_adjusted_product = vaddw_s32(x01_product, vreinterpret_s32_u32(vget_low_u32(x_neg_mask)));
+    const int64x2_t x23_adjusted_product = vaddw_high_s32(x23_product, vreinterpretq_s32_u32(x_neg_mask));
+    const int64x2_t y01_adjusted_product = vaddw_s32(y01_product, vreinterpret_s32_u32(vget_low_u32(y_neg_mask)));
+    const int64x2_t y23_adjusted_product = vaddw_high_s32(y23_product, vreinterpretq_s32_u32(y_neg_mask));
+    const int64x2_t z01_adjusted_product = vaddw_s32(z01_product, vreinterpret_s32_u32(vget_low_u32(z_neg_mask)));
+    const int64x2_t z23_adjusted_product = vaddw_high_s32(z23_product, vreinterpretq_s32_u32(z_neg_mask));
+    const int64x2_t w01_adjusted_product = vaddw_s32(w01_product, vreinterpret_s32_u32(vget_low_u32(w_neg_mask)));
+    const int64x2_t w23_adjusted_product = vaddw_high_s32(w23_product, vreinterpretq_s32_u32(w_neg_mask));
+#else
+    const int64x2_t x01_adjusted_product = vaddw_s32(x01_product, vreinterpret_s32_u32(vget_low_u32(x_neg_mask)));
+    const int64x2_t x23_adjusted_product = vaddw_s32(x23_product, vreinterpret_s32_u32(vget_high_u32(x_neg_mask)));
+    const int64x2_t y01_adjusted_product = vaddw_s32(y01_product, vreinterpret_s32_u32(vget_low_u32(y_neg_mask)));
+    const int64x2_t y23_adjusted_product = vaddw_s32(y23_product, vreinterpret_s32_u32(vget_high_u32(y_neg_mask)));
+    const int64x2_t z01_adjusted_product = vaddw_s32(z01_product, vreinterpret_s32_u32(vget_low_u32(z_neg_mask)));
+    const int64x2_t z23_adjusted_product = vaddw_s32(z23_product, vreinterpret_s32_u32(vget_high_u32(z_neg_mask)));
+    const int64x2_t w01_adjusted_product = vaddw_s32(w01_product, vreinterpret_s32_u32(vget_low_u32(w_neg_mask)));
+    const int64x2_t w23_adjusted_product = vaddw_s32(w23_product, vreinterpret_s32_u32(vget_high_u32(w_neg_mask)));
+#endif
+
+    const int64x2_t x01_scaled = vrshlq_s64(x01_adjusted_product, vshift);
+    const int64x2_t x23_scaled = vrshlq_s64(x23_adjusted_product, vshift);
+    const int64x2_t y01_scaled = vrshlq_s64(y01_adjusted_product, vshift);
+    const int64x2_t y23_scaled = vrshlq_s64(y23_adjusted_product, vshift);
+    const int64x2_t z01_scaled = vrshlq_s64(z01_adjusted_product, vshift);
+    const int64x2_t z23_scaled = vrshlq_s64(z23_adjusted_product, vshift);
+    const int64x2_t w01_scaled = vrshlq_s64(w01_adjusted_product, vshift);
+    const int64x2_t w23_scaled = vrshlq_s64(w23_adjusted_product, vshift);
+
+#ifdef __aarch64__
+    const int32x4_t x_scaled = vuzp1q_s32(vreinterpretq_s32_s64(x01_scaled), vreinterpretq_s32_s64(x23_scaled));
+    const int32x4_t y_scaled = vuzp1q_s32(vreinterpretq_s32_s64(y01_scaled), vreinterpretq_s32_s64(y23_scaled));
+    const int32x4_t z_scaled = vuzp1q_s32(vreinterpretq_s32_s64(z01_scaled), vreinterpretq_s32_s64(z23_scaled));
+    const int32x4_t w_scaled = vuzp1q_s32(vreinterpretq_s32_s64(w01_scaled), vreinterpretq_s32_s64(w23_scaled));
+
+    const int16x8_t xy_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(x_scaled), y_scaled), vzero_point);
+    const int16x8_t zw_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(z_scaled), w_scaled), vzero_point);
+    const int8x16_t xyzw_packed = vqmovn_high_s16(vqmovn_s16(xy_packed), zw_packed);
+#else
+    const int32x4_t x_scaled = vcombine_s32(vmovn_s64(x01_scaled), vmovn_s64(x23_scaled));
+    const int32x4_t y_scaled = vcombine_s32(vmovn_s64(y01_scaled), vmovn_s64(y23_scaled));
+    const int32x4_t z_scaled = vcombine_s32(vmovn_s64(z01_scaled), vmovn_s64(z23_scaled));
+    const int32x4_t w_scaled = vcombine_s32(vmovn_s64(w01_scaled), vmovn_s64(w23_scaled));
+
+    const int16x8_t xy_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(x_scaled), vqmovn_s32(y_scaled)), vzero_point);
+    const int16x8_t zw_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(z_scaled), vqmovn_s32(w_scaled)), vzero_point);
+    const int8x16_t xyzw_packed = vcombine_s8(vqmovn_s16(xy_packed), vqmovn_s16(zw_packed));
+#endif
+
+    const int8x16_t xyzw_clamped = vmaxq_s8(vminq_s8(xyzw_packed, vqmax), vqmin);
+
+    // AArch32 version:
+    //   4x VCLT.S32 Qd, Qm, #0
+    //   8x VMULL.S32 Qd, Dm, Dn
+    //   8x VADDW.S32 Qd, Qm, Dn
+    //   8x VRSHL.S64 Qd, Qm, Qn
+    //   8x VMOVN.I64 Dd, Qm
+    //   4x VQMOVN.S32 Dd, Qm
+    //   2x VQADD.S16 Qd, Qm, Qn
+    //   2x VQMOVN.S16 Dd, Qm
+    //   1x VMAX.S8 Qd, Qm, Qn
+    //   1x VMIN.S8 Qd, Qm, Qn
+    // ---------------------
+    // 46 instructions total
+    //
+    // AArch64 version:
+    //   4x CMLT Vd.4S, Vn.4S, #0
+    //   4x SMULL Vd.2D, Vn.2S, Vm.2S
+    //   4x SMULL2 Vd.2D, Vn.4S, Vm.4S
+    //   4x SADDW Vd.2D, Vn.2D, Vm.2S
+    //   4x SADDW2 Vd.2D, Vn.2D, Vm.4S
+    //   8x SRSHL Vd.2D, Vn.2D, Vm.2D
+    //   4x UZP1 Vd.4S, Vn.4S, Vm.4S
+    //   2x SQXTN Vd.4H, Vn.4S
+    //   2x SQXTN2 Vd.8H, Vn.4S
+    //   2x SQADD Vd.8H, Vn.8H, Vm.8H
+    //   1x SQXTN Vd.8B, Vn.8H
+    //   1x SQXTN2 Vd.16B, Vn.8H
+    //   1x SMIN Vd.16B, Vn.16B, Vm.16B
+    //   1x SMAX Vd.16B, Vn.16B, Vm.16B
+    // ---------------------
+    // 42 instructions total
+
+    vst1q_s8(output, xyzw_clamped);
+    output += 16;
+  }
+}
diff --git a/src/qs8-requantization/precise-psimd.c b/src/qs8-requantization/precise-psimd.c
new file mode 100644
index 0000000..499e631
--- /dev/null
+++ b/src/qs8-requantization/precise-psimd.c
@@ -0,0 +1,139 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <psimd.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_precise__psimd(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const uint32_t multiplier = (scale_bits << 8) | UINT32_C(0x80000000);
+  const uint32_t shift = 127 + 31 - (scale_bits >> 23);
+  assert(shift >= 32);
+  assert(shift < 64);
+  const uint64_t rounding = UINT64_C(1) << (shift - 1);
+
+  const psimd_u32 vmultiplier_lo = psimd_splat_u32(multiplier & UINT32_C(0x0000FFFF));
+  const psimd_u32 vmultiplier_hi = psimd_splat_u32(multiplier >> 16);
+  const psimd_s32 vzero_point = psimd_splat_s32((int32_t) zero_point);
+  const psimd_s32 vsmin = psimd_splat_s32((int32_t) qmin - (int32_t) zero_point);
+  const psimd_s32 vsmax = psimd_splat_s32((int32_t) qmax - (int32_t) zero_point);
+  const psimd_u32 vrounding_lo = psimd_splat_u32((uint32_t) rounding);
+  const psimd_u32 vrounding_hi = psimd_splat_u32((uint32_t) (rounding >> 32));
+  const psimd_u32 vshift = psimd_splat_u32(shift - 32);
+  for (; n != 0; n -= 16) {
+    const psimd_s32 x = psimd_load_s32(input);
+    const psimd_s32 y = psimd_load_s32(input + 4);
+    const psimd_s32 z = psimd_load_s32(input + 8);
+    const psimd_s32 w = psimd_load_s32(input + 12);
+    input += 16;
+
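+    // Compute per-lane sign masks (all ones for negative lanes) and absolute values: the fixed-point
+    // multiplication below operates on magnitudes, and the signs are restored after scaling.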
+    const psimd_s32 x_neg_mask = x >> psimd_splat_s32(31);
+    const psimd_s32 y_neg_mask = y >> psimd_splat_s32(31);
+    const psimd_s32 z_neg_mask = z >> psimd_splat_s32(31);
+    const psimd_s32 w_neg_mask = w >> psimd_splat_s32(31);
+
+    const psimd_u32 x_abs = (psimd_u32) ((x ^ x_neg_mask) - x_neg_mask);
+    const psimd_u32 y_abs = (psimd_u32) ((y ^ y_neg_mask) - y_neg_mask);
+    const psimd_u32 z_abs = (psimd_u32) ((z ^ z_neg_mask) - z_neg_mask);
+    const psimd_u32 w_abs = (psimd_u32) ((w ^ w_neg_mask) - w_neg_mask);
+
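+    // Emulate the unsigned 32x32->64-bit multiplication with 16-bit limbs (schoolbook long multiplication):
+    // product = ll + ((lh + hl) << 16) + (hh << 32), with all partial products kept in 32-bit lanes and carries
+    // folded in as the limbs are combined.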
+    const psimd_u32 x_abs_lo = x_abs & psimd_splat_u32(UINT32_C(0x0000FFFF));
+    const psimd_u32 x_abs_hi = x_abs >> psimd_splat_u32(16);
+    const psimd_u32 y_abs_lo = y_abs & psimd_splat_u32(UINT32_C(0x0000FFFF));
+    const psimd_u32 y_abs_hi = y_abs >> psimd_splat_u32(16);
+    const psimd_u32 z_abs_lo = z_abs & psimd_splat_u32(UINT32_C(0x0000FFFF));
+    const psimd_u32 z_abs_hi = z_abs >> psimd_splat_u32(16);
+    const psimd_u32 w_abs_lo = w_abs & psimd_splat_u32(UINT32_C(0x0000FFFF));
+    const psimd_u32 w_abs_hi = w_abs >> psimd_splat_u32(16);
+
+    const psimd_u32 x_product_ll = x_abs_lo * vmultiplier_lo;
+    const psimd_u32 y_product_ll = y_abs_lo * vmultiplier_lo;
+    const psimd_u32 z_product_ll = z_abs_lo * vmultiplier_lo;
+    const psimd_u32 w_product_ll = w_abs_lo * vmultiplier_lo;
+
+    const psimd_u32 x_product_lh = x_abs_lo * vmultiplier_hi + (x_product_ll >> psimd_splat_u32(16));
+    const psimd_u32 y_product_lh = y_abs_lo * vmultiplier_hi + (y_product_ll >> psimd_splat_u32(16));
+    const psimd_u32 z_product_lh = z_abs_lo * vmultiplier_hi + (z_product_ll >> psimd_splat_u32(16));
+    const psimd_u32 w_product_lh = w_abs_lo * vmultiplier_hi + (w_product_ll >> psimd_splat_u32(16));
+
+    const psimd_u32 x_product_hl = x_abs_hi * vmultiplier_lo + (x_product_lh & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+    const psimd_u32 y_product_hl = y_abs_hi * vmultiplier_lo + (y_product_lh & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+    const psimd_u32 z_product_hl = z_abs_hi * vmultiplier_lo + (z_product_lh & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+    const psimd_u32 w_product_hl = w_abs_hi * vmultiplier_lo + (w_product_lh & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+
+    const psimd_u32 x_product_lo =
+        (x_product_hl << psimd_splat_u32(16)) + (x_product_ll & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+    const psimd_u32 y_product_lo =
+        (y_product_hl << psimd_splat_u32(16)) + (y_product_ll & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+    const psimd_u32 z_product_lo =
+        (z_product_hl << psimd_splat_u32(16)) + (z_product_ll & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+    const psimd_u32 w_product_lo =
+        (w_product_hl << psimd_splat_u32(16)) + (w_product_ll & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+
+    const psimd_u32 x_product_hi =
+        x_abs_hi * vmultiplier_hi + (x_product_lh >> psimd_splat_u32(16)) + (x_product_hl >> psimd_splat_u32(16));
+    const psimd_u32 y_product_hi =
+        y_abs_hi * vmultiplier_hi + (y_product_lh >> psimd_splat_u32(16)) + (y_product_hl >> psimd_splat_u32(16));
+    const psimd_u32 z_product_hi =
+        z_abs_hi * vmultiplier_hi + (z_product_lh >> psimd_splat_u32(16)) + (z_product_hl >> psimd_splat_u32(16));
+    const psimd_u32 w_product_hi =
+        w_abs_hi * vmultiplier_hi + (w_product_lh >> psimd_splat_u32(16)) + (w_product_hl >> psimd_splat_u32(16));
+
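+    // Add the 64-bit rounding constant using only 32-bit operations: the arithmetic right shift by 31 yields
+    // -1 exactly when (product_lo & rounding_lo) has bit 31 set, i.e. when the low-word addition would carry,
+    // and subtracting that -1 adds the carry into the high word.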
+    const psimd_u32 x_adjusted_product =
+        (x_product_hi + vrounding_hi) - ((psimd_s32) (x_product_lo & vrounding_lo) >> psimd_splat_s32(31));
+    const psimd_u32 y_adjusted_product =
+        (y_product_hi + vrounding_hi) - ((psimd_s32) (y_product_lo & vrounding_lo) >> psimd_splat_s32(31));
+    const psimd_u32 z_adjusted_product =
+        (z_product_hi + vrounding_hi) - ((psimd_s32) (z_product_lo & vrounding_lo) >> psimd_splat_s32(31));
+    const psimd_u32 w_adjusted_product =
+        (w_product_hi + vrounding_hi) - ((psimd_s32) (w_product_lo & vrounding_lo) >> psimd_splat_s32(31));
+
+    const psimd_u32 x_abs_scaled = x_adjusted_product >> vshift;
+    const psimd_u32 y_abs_scaled = y_adjusted_product >> vshift;
+    const psimd_u32 z_abs_scaled = z_adjusted_product >> vshift;
+    const psimd_u32 w_abs_scaled = w_adjusted_product >> vshift;
+
+    const psimd_s32 x_scaled = (psimd_s32) (x_abs_scaled ^ x_neg_mask) - x_neg_mask;
+    const psimd_s32 y_scaled = (psimd_s32) (y_abs_scaled ^ y_neg_mask) - y_neg_mask;
+    const psimd_s32 z_scaled = (psimd_s32) (z_abs_scaled ^ z_neg_mask) - z_neg_mask;
+    const psimd_s32 w_scaled = (psimd_s32) (w_abs_scaled ^ w_neg_mask) - w_neg_mask;
+
+    const psimd_s32 x_clamped = psimd_max_s32(psimd_min_s32(x_scaled, vsmax), vsmin) + vzero_point;
+    const psimd_s32 y_clamped = psimd_max_s32(psimd_min_s32(y_scaled, vsmax), vsmin) + vzero_point;
+    const psimd_s32 z_clamped = psimd_max_s32(psimd_min_s32(z_scaled, vsmax), vsmin) + vzero_point;
+    const psimd_s32 w_clamped = psimd_max_s32(psimd_min_s32(w_scaled, vsmax), vsmin) + vzero_point;
+
+    const psimd_s16 xy_clamped = psimd_concat_even_s16((psimd_s16) x_clamped, (psimd_s16) y_clamped);
+    const psimd_s16 zw_clamped = psimd_concat_even_s16((psimd_s16) z_clamped, (psimd_s16) w_clamped);
+
+    const psimd_s8 xyzw_clamped = psimd_concat_even_s8((psimd_s8) xy_clamped, (psimd_s8) zw_clamped);
+
+    psimd_store_s8(output, xyzw_clamped);
+    output += 16;
+  }
+}
diff --git a/src/qs8-requantization/precise-scalar-signed64.c b/src/qs8-requantization/precise-scalar-signed64.c
new file mode 100644
index 0000000..8634266
--- /dev/null
+++ b/src/qs8-requantization/precise-scalar-signed64.c
@@ -0,0 +1,95 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/scalar-utils.h>
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_precise__scalar_signed64(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 4 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
+  const uint32_t shift = 127 + 23 - (scale_bits >> 23);
+  assert(shift >= 24);
+  assert(shift < 56);
+
+  const int64_t rounding = INT64_C(1) << (shift - 1);
+  const int32_t smin = (int32_t) qmin - (int32_t) zero_point;
+  const int32_t smax = (int32_t) qmax - (int32_t) zero_point;
+  for (; n != 0; n -= 4) {
+    const int32_t x = input[0];
+    const int32_t y = input[1];
+    const int32_t z = input[2];
+    const int32_t w = input[3];
+    input += 4;
+
+    // Compute full 64-bit product of signed 32-bit factors.
+    //
+    // Note: multiplier can be treated as either signed or unsigned.
+    const int64_t x_product = (int64_t) x * (int64_t) multiplier;
+    const int64_t y_product = (int64_t) y * (int64_t) multiplier;
+    const int64_t z_product = (int64_t) z * (int64_t) multiplier;
+    const int64_t w_product = (int64_t) w * (int64_t) multiplier;
+
+    // Adjust the product before the subsequent shift-with-rounding-up, so that it behaves as a shift with
+    // rounding away from zero.
+    const int64_t x_adjusted_product = x_product - (int64_t)(x < 0);
+    const int64_t y_adjusted_product = y_product - (int64_t)(y < 0);
+    const int64_t z_adjusted_product = z_product - (int64_t)(z < 0);
+    const int64_t w_adjusted_product = w_product - (int64_t)(w < 0);
+
+    // Arithmetically shift the full 64-bit product right with rounding.
+    // Rounding is performed towards closest integer, with midpoints rounded up.
+    //
+    // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit
+    // "right shift with rounding" instruction each line below can be represented by just one such instruction
+    // (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD).
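+    // E.g. with shift == 2 (rounding == 2), a product of 6 (i.e. 1.5) gives (6 + 2) >> 2 = 2, and an adjusted
+    // product of -6 - 1 (i.e. -1.5) gives (-7 + 2) >> 2 = -2: both midpoints round away from zero.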
+    const int32_t x_scaled = (int32_t) asr_s64(x_adjusted_product + rounding, shift);
+    const int32_t y_scaled = (int32_t) asr_s64(y_adjusted_product + rounding, shift);
+    const int32_t z_scaled = (int32_t) asr_s64(z_adjusted_product + rounding, shift);
+    const int32_t w_scaled = (int32_t) asr_s64(w_adjusted_product + rounding, shift);
+
+    // Clamp the scaled value between (qmin - zero point) and (qmax - zero point).
+    const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
+    const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
+    const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
+    const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
+
+    // Add zero point to clamped value.
+    // The result is guaranteed to be in [qmin, qmax] range.
+    //
+    // This addition is done after clamping so that every intermediate value stays within the signed 32-bit
+    // range: scaled values are in [-2147483520, 2147483519], and the zero point is in [-128, 127].
+    const int32_t x_biased = x_clamped + zero_point;
+    const int32_t y_biased = y_clamped + zero_point;
+    const int32_t z_biased = z_clamped + zero_point;
+    const int32_t w_biased = w_clamped + zero_point;
+
+    output[0] = (int8_t) x_biased;
+    output[1] = (int8_t) y_biased;
+    output[2] = (int8_t) z_biased;
+    output[3] = (int8_t) w_biased;
+    output += 4;
+  }
+}
diff --git a/src/qs8-requantization/precise-scalar-unsigned32.c b/src/qs8-requantization/precise-scalar-unsigned32.c
new file mode 100644
index 0000000..d02d828
--- /dev/null
+++ b/src/qs8-requantization/precise-scalar-unsigned32.c
@@ -0,0 +1,130 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/scalar-utils.h>
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_precise__scalar_unsigned32(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 4 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const uint32_t multiplier = (scale_bits << 8) | UINT32_C(0x80000000);
+  const uint32_t shift = 127 + 31 - (scale_bits >> 23);
+  assert(shift >= 32);
+  assert(shift < 64);
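+  // E.g. scale = 0.5f (bits 0x3F000000) yields multiplier = 0x80000000 (2**31) and shift = 127 + 31 - 126 = 32,
+  // so that multiplier * 2**-shift == 0.5 exactly.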
+
+  const uint64_t rounding = UINT64_C(1) << (shift - 1);
+  const uint32_t rounding_hi = (uint32_t)(rounding >> 32);
+  const uint32_t rounding_lo = (uint32_t) rounding;
+  const uint32_t shift_minus_32 = shift - 32;
+  const int32_t smin = (int32_t) qmin - (int32_t) zero_point;
+  const int32_t smax = (int32_t) qmax - (int32_t) zero_point;
+  for (; n != 0; n -= 4) {
+    const int32_t x = input[0];
+    const int32_t y = input[1];
+    const int32_t z = input[2];
+    const int32_t w = input[3];
+    input += 4;
+
+    // Compute absolute value of input as unsigned 32-bit int.
+    // All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
+    const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x;
+    const uint32_t y_abs = (y >= 0) ? (uint32_t) y : -(uint32_t) y;
+    const uint32_t z_abs = (z >= 0) ? (uint32_t) z : -(uint32_t) z;
+    const uint32_t w_abs = (w >= 0) ? (uint32_t) w : -(uint32_t) w;
+
+    // Compute full 64-bit product of 32-bit factors.
+    const uint64_t x_product = (uint64_t) x_abs * (uint64_t) multiplier;
+    const uint64_t y_product = (uint64_t) y_abs * (uint64_t) multiplier;
+    const uint64_t z_product = (uint64_t) z_abs * (uint64_t) multiplier;
+    const uint64_t w_product = (uint64_t) w_abs * (uint64_t) multiplier;
+
+    // Shift the full 64-bit product right with rounding.
+    // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
+    //
+    // Generally, this operation requires both 64-bit addition and 64-bit shift, but we use two tricks to replace
+    // 64-bit operations with 32-bit operations.
+    //
+    // To avoid full 64-bit addition we make use of three facts:
+    // - 64-bit rounding value added before the shift is a power of 2, and thus has only one bit set.
+    // - When 0x1.0p-1f <= scale < 1.0f, then the non-zero bit in rounding is in the low 32 bits, and
+    //   rounding is exactly 0x80000000 (2**31), because rounding is 2**(shift - 1) and shift == 32. In this case,
+    //   addition of rounding can affect high 32 bits of the product only through overflow, which happens if
+    //   low 32-bit part of the product equals or exceeds 0x80000000. We can reformulate the latter condition
+    //   as low 32-bit part of the product has the bit 31 set, and then overflow happens if both the low 32-bit part
+    //   of the product and the low 32-bit part of the rounding value have bit 31 set. Since 32-bit numbers with the
+    //   bit 31 set are negative when interpreted as signed integers, we can check the overflow condition as
+    //      (int32_t) (LOW(product) & LOW(rounding)) < 0
+    // - When 0x1.0p-32f <= scale < 0x1.0p-1f, then the non-zero bit is in the high 32 bits of rounding. We just need
+    //   to do 32-bit addition of high 32 bits of rounding and high 32 bits of product. This addition never
+    //   overflows because product <= 0x80000000 * 0xFFFFFF00 < 2**63 and rounding = 2**(shift - 1) <= 2**62.
+    //
+    // To avoid full 64-bit shift, we leverage the fact that shift >= 32, and do it in two steps:
+    // - Shift by 32, which can be implemented by extracting the high 32-bit word on 32-bit systems.
+    // - Shift by (shift - 32), which can be implemented as a 32-bit shift of high word of addition result.
+    const uint32_t x_carry_lo = (uint32_t) ((int32_t) ((uint32_t) x_product & rounding_lo) < 0);
+    const uint32_t y_carry_lo = (uint32_t) ((int32_t) ((uint32_t) y_product & rounding_lo) < 0);
+    const uint32_t z_carry_lo = (uint32_t) ((int32_t) ((uint32_t) z_product & rounding_lo) < 0);
+    const uint32_t w_carry_lo = (uint32_t) ((int32_t) ((uint32_t) w_product & rounding_lo) < 0);
+
+    const uint32_t x_product_hi = (uint32_t) (x_product >> 32);
+    const uint32_t y_product_hi = (uint32_t) (y_product >> 32);
+    const uint32_t z_product_hi = (uint32_t) (z_product >> 32);
+    const uint32_t w_product_hi = (uint32_t) (w_product >> 32);
+
+    const uint32_t x_abs_scaled = (uint32_t) (x_product_hi + rounding_hi + x_carry_lo) >> shift_minus_32;
+    const uint32_t y_abs_scaled = (uint32_t) (y_product_hi + rounding_hi + y_carry_lo) >> shift_minus_32;
+    const uint32_t z_abs_scaled = (uint32_t) (z_product_hi + rounding_hi + z_carry_lo) >> shift_minus_32;
+    const uint32_t w_abs_scaled = (uint32_t) (w_product_hi + rounding_hi + w_carry_lo) >> shift_minus_32;
+
+    // Copy the sign of input to scaled absolute input value.
+    const int32_t x_scaled = (int32_t) (x >= 0 ? x_abs_scaled : -x_abs_scaled);
+    const int32_t y_scaled = (int32_t) (y >= 0 ? y_abs_scaled : -y_abs_scaled);
+    const int32_t z_scaled = (int32_t) (z >= 0 ? z_abs_scaled : -z_abs_scaled);
+    const int32_t w_scaled = (int32_t) (w >= 0 ? w_abs_scaled : -w_abs_scaled);
+
+    // Clamp the scaled value between (qmin - zero point) and (qmax - zero point).
+    const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
+    const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
+    const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
+    const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
+
+    // Add zero point to clamped value.
+    // The result is guaranteed to be in [qmin, qmax] range.
+    //
+    // This addition can be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
+    // range, so addition of zero point (which is in [-128, 127] range) can not overflow signed 32-bit integer.
+    const int32_t x_biased = x_clamped + zero_point;
+    const int32_t y_biased = y_clamped + zero_point;
+    const int32_t z_biased = z_clamped + zero_point;
+    const int32_t w_biased = w_clamped + zero_point;
+
+    output[0] = (int8_t) x_biased;
+    output[1] = (int8_t) y_biased;
+    output[2] = (int8_t) z_biased;
+    output[3] = (int8_t) w_biased;
+    output += 4;
+  }
+}
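
The low-word carry trick is straightforward to cross-check against a plain 64-bit rounding shift. A throwaway sketch (not part of the unit tests) under this kernel's own preconditions, i.e. product < 2**63 and 32 <= shift < 64:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    static uint32_t shift_with_rounding_64(uint64_t product, uint32_t shift) {
      return (uint32_t) ((product + (UINT64_C(1) << (shift - 1))) >> shift);
    }

    static uint32_t shift_with_rounding_32(uint64_t product, uint32_t shift) {
      const uint64_t rounding = UINT64_C(1) << (shift - 1);
      const uint32_t rounding_hi = (uint32_t) (rounding >> 32);
      const uint32_t rounding_lo = (uint32_t) rounding;
      // A carry out of the low 32 bits is possible only when shift == 32 (rounding_lo == 0x80000000).
      const uint32_t carry_lo = (uint32_t) ((int32_t) ((uint32_t) product & rounding_lo) < 0);
      return ((uint32_t) (product >> 32) + rounding_hi + carry_lo) >> (shift - 32);
    }

    int main(void) {
      const uint64_t products[] = {
        UINT64_C(0), UINT64_C(0x80000000), UINT64_C(0xFFFFFFFF), UINT64_C(0x7FFFFF0080000000),
      };
      for (uint32_t shift = 32; shift < 64; shift++) {
        for (size_t i = 0; i < sizeof(products) / sizeof(products[0]); i++) {
          assert(shift_with_rounding_32(products[i], shift) == shift_with_rounding_64(products[i], shift));
        }
      }
      return 0;
    }
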
diff --git a/src/qs8-requantization/precise-scalar-unsigned64.c b/src/qs8-requantization/precise-scalar-unsigned64.c
new file mode 100644
index 0000000..778e95b
--- /dev/null
+++ b/src/qs8-requantization/precise-scalar-unsigned64.c
@@ -0,0 +1,102 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/scalar-utils.h>
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_precise__scalar_unsigned64(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 4 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000);
+  const uint32_t shift = 127 + 23 - (scale_bits >> 23);
+  assert(shift >= 24);
+  assert(shift < 56);
+
+  const uint64_t rounding = UINT64_C(1) << (shift - 1);
+  const int32_t smin = (int32_t) qmin - (int32_t) zero_point;
+  const int32_t smax = (int32_t) qmax - (int32_t) zero_point;
+  for (; n != 0; n -= 4) {
+    const int32_t x = input[0];
+    const int32_t y = input[1];
+    const int32_t z = input[2];
+    const int32_t w = input[3];
+    input += 4;
+
+    // Compute absolute value of input as unsigned 32-bit int.
+    // All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
+    const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x;
+    const uint32_t y_abs = (y >= 0) ? (uint32_t) y : -(uint32_t) y;
+    const uint32_t z_abs = (z >= 0) ? (uint32_t) z : -(uint32_t) z;
+    const uint32_t w_abs = (w >= 0) ? (uint32_t) w : -(uint32_t) w;
+
+    // Compute full 64-bit product of 32-bit factors.
+    const uint64_t x_product = (uint64_t) x_abs * (uint64_t) multiplier;
+    const uint64_t y_product = (uint64_t) y_abs * (uint64_t) multiplier;
+    const uint64_t z_product = (uint64_t) z_abs * (uint64_t) multiplier;
+    const uint64_t w_product = (uint64_t) w_abs * (uint64_t) multiplier;
+
+    // Shift the full 64-bit product right with rounding.
+    // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
+    //
+    // Note that although rounding is precomputed, it depends on the shift value, and on processors with a 64-bit
+    // "right shift with rounding" instruction each line below can be implemented with just one such instruction
+    // (e.g. VRSHL.U64 on ARM NEON, URSHL in ARM64 Advanced SIMD).
+    const uint32_t x_abs_scaled = (uint32_t) ((x_product + rounding) >> shift);
+    const uint32_t y_abs_scaled = (uint32_t) ((y_product + rounding) >> shift);
+    const uint32_t z_abs_scaled = (uint32_t) ((z_product + rounding) >> shift);
+    const uint32_t w_abs_scaled = (uint32_t) ((w_product + rounding) >> shift);
+
+    // Copy the sign of input to scaled absolute input value.
+    //
+    // On x86 processors with SSSE3 instruction set, this operation nicely maps to PSIGND instruction.
+    const int32_t x_scaled = (int32_t) (x >= 0 ? x_abs_scaled : -x_abs_scaled);
+    const int32_t y_scaled = (int32_t) (y >= 0 ? y_abs_scaled : -y_abs_scaled);
+    const int32_t z_scaled = (int32_t) (z >= 0 ? z_abs_scaled : -z_abs_scaled);
+    const int32_t w_scaled = (int32_t) (w >= 0 ? w_abs_scaled : -w_abs_scaled);
+
+    // Clamp the scaled value between (qmin - zero point) and (qmax - zero point).
+    const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
+    const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
+    const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
+    const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
+
+    // Add zero point to clamped value.
+    // The result is guaranteed to be in [qmin, qmax] range.
+    //
+    // This addition can be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
+    // range, so addition of zero point (which is in [-128, 127] range) can not overflow signed 32-bit integer.
+    const int32_t x_biased = x_clamped + zero_point;
+    const int32_t y_biased = y_clamped + zero_point;
+    const int32_t z_biased = z_clamped + zero_point;
+    const int32_t w_biased = w_clamped + zero_point;
+
+    output[0] = (int8_t) x_biased;
+    output[1] = (int8_t) y_biased;
+    output[2] = (int8_t) z_biased;
+    output[3] = (int8_t) w_biased;
+    output += 4;
+  }
+}
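
For illustration, a hypothetical driver (accumulator values invented) that exercises the away-from-zero midpoint behavior end to end, with scale 2**-8, zero point 1, and the full int8 range:

    #include <stdint.h>
    #include <stdio.h>

    #include <xnnpack/requantization-stubs.h>

    int main(void) {
      // 512 * 2**-8 = 2.0; 384 * 2**-8 = 1.5, a midpoint that rounds away from zero to 2.
      const int32_t acc[4] = { 512, -512, 384, -384 };
      int8_t out[4];
      xnn_qs8_requantize_precise__scalar_unsigned64(
          4, acc, 0x1.0p-8f, /*zero_point=*/1, /*qmin=*/INT8_MIN, /*qmax=*/INT8_MAX, out);
      printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // prints: 3 -1 3 -1
      return 0;
    }
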
diff --git a/src/qs8-requantization/precise-sse2.c b/src/qs8-requantization/precise-sse2.c
new file mode 100644
index 0000000..192f64e
--- /dev/null
+++ b/src/qs8-requantization/precise-sse2.c
@@ -0,0 +1,131 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <emmintrin.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_precise__sse2(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000);
+  const uint32_t shift = 127 + 23 - (scale_bits >> 23);
+  assert(shift >= 24);
+  assert(shift < 56);
+  const uint64_t rounding = UINT64_C(1) << (shift - 1);
+
+  const __m128i vmultiplier = _mm_set1_epi32(multiplier);
+  const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+  const __m128i vqmin = _mm_set1_epi16((short) qmin);
+  const __m128i vqmax = _mm_set1_epi16((short) qmax);
+  const __m128i vshift = _mm_cvtsi32_si128((int) shift);
+  const __m128i vrounding = _mm_set1_epi64x(rounding);
+  for (; n != 0; n -= 16) {
+    const __m128i x = _mm_loadu_si128((const __m128i*) input);
+    const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+    const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+    const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+    input += 16;
+
+    const __m128i x_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), x);
+    const __m128i y_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), y);
+    const __m128i z_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), z);
+    const __m128i w_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), w);
+
+    const __m128i x_abs0123 = _mm_sub_epi32(_mm_xor_si128(x, x_neg_mask), x_neg_mask);
+    const __m128i y_abs0123 = _mm_sub_epi32(_mm_xor_si128(y, y_neg_mask), y_neg_mask);
+    const __m128i z_abs0123 = _mm_sub_epi32(_mm_xor_si128(z, z_neg_mask), z_neg_mask);
+    const __m128i w_abs0123 = _mm_sub_epi32(_mm_xor_si128(w, w_neg_mask), w_neg_mask);
+
+    const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier);
+    const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier);
+    const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier);
+    const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier);
+
+    const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier);
+    const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier);
+    const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier);
+    const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier);
+
+    const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshift);
+    const __m128i x_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(x_absmul13, vrounding), vshift);
+    const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshift);
+    const __m128i y_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(y_absmul13, vrounding), vshift);
+    const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshift);
+    const __m128i z_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(z_absmul13, vrounding), vshift);
+    const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshift);
+    const __m128i w_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(w_absmul13, vrounding), vshift);
+
+    const __m128i x_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(x_abs_scaled02), _mm_castsi128_ps(x_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i y_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(y_abs_scaled02), _mm_castsi128_ps(y_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i z_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(z_abs_scaled02), _mm_castsi128_ps(z_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i w_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(w_abs_scaled02), _mm_castsi128_ps(w_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+
+    const __m128i x_abs_scaled = _mm_shuffle_epi32(x_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i y_abs_scaled = _mm_shuffle_epi32(y_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i z_abs_scaled = _mm_shuffle_epi32(z_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i w_abs_scaled = _mm_shuffle_epi32(w_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+
+    const __m128i x_scaled = _mm_sub_epi32(_mm_xor_si128(x_abs_scaled, x_neg_mask), x_neg_mask);
+    const __m128i y_scaled = _mm_sub_epi32(_mm_xor_si128(y_abs_scaled, y_neg_mask), y_neg_mask);
+    const __m128i z_scaled = _mm_sub_epi32(_mm_xor_si128(z_abs_scaled, z_neg_mask), z_neg_mask);
+    const __m128i w_scaled = _mm_sub_epi32(_mm_xor_si128(w_abs_scaled, w_neg_mask), w_neg_mask);
+
+    const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point);
+    const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point);
+    const __m128i xy_clamped = _mm_max_epi16(_mm_min_epi16(xy_packed, vqmax), vqmin);
+    const __m128i zw_clamped = _mm_max_epi16(_mm_min_epi16(zw_packed, vqmax), vqmin);
+    const __m128i xyzw_clamped = _mm_packs_epi16(xy_clamped, zw_clamped);
+
+    // 4x PXOR (setzero)
+    // 4x PCMPGTD
+    // 8x PSUBD
+    // 8x PXOR
+    // 8x PSHUFD
+    // 8x PMULUDQ
+    // 8x PSRLQ
+    // 8x PADDQ
+    // 4x SHUFPS
+    // 2x PACKSSDW
+    // 2x PADDSW
+    // 2x PMAXSW
+    // 2x PMINSW
+    // 1x PACKSSWB
+    // ---------------------
+    // 69 instructions total
+
+    _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+    output += 16;
+  }
+}
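
Without SSSE3's PABSD/PSIGND, the SSE2 kernel forms absolute values and restores signs via the two's-complement identity (x ^ m) - m, where m is the all-ones mask for negative lanes. A one-lane scalar model of the round trip (a sketch only; INT32_MIN is deliberately excluded, where abs() wraps back to 0x80000000 but the subsequent unsigned multiply still treats it correctly as 2**31):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      for (int32_t x = -1000; x <= 1000; x++) {
        const int32_t neg_mask = x < 0 ? -1 : 0;          // PCMPGTD(zero, x)
        const int32_t x_abs = (x ^ neg_mask) - neg_mask;  // PXOR + PSUBD
        assert(x_abs == (x < 0 ? -x : x));
        // Applying the same transform again restores the original sign.
        assert(((x_abs ^ neg_mask) - neg_mask) == x);
      }
      return 0;
    }
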
diff --git a/src/qs8-requantization/precise-sse4.c b/src/qs8-requantization/precise-sse4.c
new file mode 100644
index 0000000..2cb3a02
--- /dev/null
+++ b/src/qs8-requantization/precise-sse4.c
@@ -0,0 +1,117 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <smmintrin.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_precise__sse4(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const uint32_t multiplier = (scale_bits << 8) | UINT32_C(0x80000000);
+  const uint32_t shift = 127 + 31 - (scale_bits >> 23);
+  assert(shift >= 32);
+  assert(shift < 64);
+  const uint64_t rounding = UINT64_C(1) << (shift - 1);
+
+  const __m128i vmultiplier = _mm_set1_epi32(multiplier);
+  const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+  const __m128i vqmin = _mm_set1_epi8((char) qmin);
+  const __m128i vqmax = _mm_set1_epi8((char) qmax);
+  const __m128i vshiftlo = _mm_cvtsi32_si128((int) shift);
+  const __m128i vshifthi = _mm_cvtsi32_si128((int) shift - 32);
+  const __m128i vrounding = _mm_set1_epi64x(rounding);
+  for (; n != 0; n -= 16) {
+    const __m128i x = _mm_loadu_si128((const __m128i*) input);
+    const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+    const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+    const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+    input += 16;
+
+    const __m128i x_abs0123 = _mm_abs_epi32(x);
+    const __m128i y_abs0123 = _mm_abs_epi32(y);
+    const __m128i z_abs0123 = _mm_abs_epi32(z);
+    const __m128i w_abs0123 = _mm_abs_epi32(w);
+
+    const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier);
+    const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier);
+    const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier);
+    const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier);
+
+    const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier);
+    const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier);
+    const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier);
+    const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier);
+
+    const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshiftlo);
+    const __m128i x_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(x_absmul13, vrounding), vshifthi);
+    const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshiftlo);
+    const __m128i y_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(y_absmul13, vrounding), vshifthi);
+    const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshiftlo);
+    const __m128i z_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(z_absmul13, vrounding), vshifthi);
+    const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshiftlo);
+    const __m128i w_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(w_absmul13, vrounding), vshifthi);
+
+    const __m128i x_abs_scaled = _mm_blend_epi16(x_abs_scaled02, x_abs_scaled13, 0xCC);
+    const __m128i y_abs_scaled = _mm_blend_epi16(y_abs_scaled02, y_abs_scaled13, 0xCC);
+    const __m128i z_abs_scaled = _mm_blend_epi16(z_abs_scaled02, z_abs_scaled13, 0xCC);
+    const __m128i w_abs_scaled = _mm_blend_epi16(w_abs_scaled02, w_abs_scaled13, 0xCC);
+
+    const __m128i x_scaled = _mm_sign_epi32(x_abs_scaled, x);
+    const __m128i y_scaled = _mm_sign_epi32(y_abs_scaled, y);
+    const __m128i z_scaled = _mm_sign_epi32(z_abs_scaled, z);
+    const __m128i w_scaled = _mm_sign_epi32(w_abs_scaled, w);
+
+    const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point);
+    const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point);
+    const __m128i xyzw_packed = _mm_packs_epi16(xy_packed, zw_packed);
+    const __m128i xyzw_clamped = _mm_max_epi8(_mm_min_epi8(xyzw_packed, vqmax), vqmin);
+
+    // 4x PABSD
+    // 4x PSHUFD
+    // 8x PMULUDQ
+    // 4x PSRLQ
+    // 4x PSRLD
+    // 8x PADDQ
+    // 4x PBLENDW
+    // 4x PSIGND
+    // 2x PACKSSDW
+    // 2x PADDSW
+    // 1x PACKSSWB
+    // 1x PMAXSB
+    // 1x PMINSB
+    // ---------------------
+    // 47 instructions total
+
+    _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+    output += 16;
+  }
+}
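
The asymmetric shifts deserve a note: even products are shifted by the full amount with PSRLQ, while odd products get a 32-bit PSRLD of the high word by (shift - 32). Since shift >= 32, the surviving bits of the rounded product live entirely in its high 32-bit word, which is exactly the word PBLENDW later selects for lanes 1 and 3. The identity in scalar form, as a sketch:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      const uint64_t rounded_product = UINT64_C(0x123456789ABCDEF0);
      for (uint32_t shift = 32; shift < 64; shift++) {
        const uint32_t via_64bit_shift = (uint32_t) (rounded_product >> shift);            // PSRLQ path
        const uint32_t via_high_word = (uint32_t) (rounded_product >> 32) >> (shift - 32); // PSRLD path
        assert(via_64bit_shift == via_high_word);
      }
      return 0;
    }
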
diff --git a/src/qs8-requantization/precise-ssse3.c b/src/qs8-requantization/precise-ssse3.c
new file mode 100644
index 0000000..78c2c76
--- /dev/null
+++ b/src/qs8-requantization/precise-ssse3.c
@@ -0,0 +1,125 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <tmmintrin.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_precise__ssse3(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000);
+  const uint32_t shift = 127 + 23 - (scale_bits >> 23);
+  assert(shift >= 24);
+  assert(shift < 56);
+  const uint64_t rounding = UINT64_C(1) << (shift - 1);
+
+  const __m128i vmultiplier = _mm_set1_epi32(multiplier);
+  const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+  const __m128i vqmin = _mm_set1_epi16((short) qmin);
+  const __m128i vqmax = _mm_set1_epi16((short) qmax);
+  const __m128i vshift = _mm_cvtsi32_si128((int) shift);
+  const __m128i vrounding = _mm_set1_epi64x(rounding);
+  for (; n != 0; n -= 16) {
+    const __m128i x = _mm_loadu_si128((const __m128i*) input);
+    const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+    const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+    const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+    input += 16;
+
+    const __m128i x_abs0123 = _mm_abs_epi32(x);
+    const __m128i y_abs0123 = _mm_abs_epi32(y);
+    const __m128i z_abs0123 = _mm_abs_epi32(z);
+    const __m128i w_abs0123 = _mm_abs_epi32(w);
+
+    const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier);
+    const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier);
+    const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier);
+    const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier);
+
+    const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier);
+    const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier);
+    const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier);
+    const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier);
+
+    const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshift);
+    const __m128i x_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(x_absmul13, vrounding), vshift);
+    const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshift);
+    const __m128i y_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(y_absmul13, vrounding), vshift);
+    const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshift);
+    const __m128i z_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(z_absmul13, vrounding), vshift);
+    const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshift);
+    const __m128i w_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(w_absmul13, vrounding), vshift);
+
+    const __m128i x_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(x_abs_scaled02), _mm_castsi128_ps(x_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i y_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(y_abs_scaled02), _mm_castsi128_ps(y_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i z_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(z_abs_scaled02), _mm_castsi128_ps(z_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i w_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(w_abs_scaled02), _mm_castsi128_ps(w_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+
+    const __m128i x_abs_scaled = _mm_shuffle_epi32(x_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i y_abs_scaled = _mm_shuffle_epi32(y_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i z_abs_scaled = _mm_shuffle_epi32(z_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i w_abs_scaled = _mm_shuffle_epi32(w_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+
+    const __m128i x_scaled = _mm_sign_epi32(x_abs_scaled, x);
+    const __m128i y_scaled = _mm_sign_epi32(y_abs_scaled, y);
+    const __m128i z_scaled = _mm_sign_epi32(z_abs_scaled, z);
+    const __m128i w_scaled = _mm_sign_epi32(w_abs_scaled, w);
+
+    const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point);
+    const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point);
+    const __m128i xy_clamped = _mm_max_epi16(_mm_min_epi16(xy_packed, vqmax), vqmin);
+    const __m128i zw_clamped = _mm_max_epi16(_mm_min_epi16(zw_packed, vqmax), vqmin);
+    const __m128i xyzw_clamped = _mm_packs_epi16(xy_clamped, zw_clamped);
+
+    // 4x PABSD
+    // 8x PSHUFD
+    // 8x PMULUDQ
+    // 8x PSRLQ
+    // 8x PADDQ
+    // 4x SHUFPS
+    // 4x PSIGND
+    // 2x PACKSSDW
+    // 2x PADDSW
+    // 2x PMAXSW
+    // 2x PMINSW
+    // 1x PACKSSWB
+    // ---------------------
+    // 53 instructions total
+
+    _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+    output += 16;
+  }
+}
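
For reference, the per-lane semantics of PSIGND (_mm_sign_epi32) used above, in scalar form: negate when the sign source is negative, pass through when positive, zero when the sign source is zero. The zero case is benign here, because a zero input also has a zero scaled absolute value. A minimal sketch:

    #include <assert.h>
    #include <stdint.h>

    static int32_t psignd_lane(int32_t a, int32_t b) {
      if (b < 0) return -a;  // negative sign source: negate
      if (b == 0) return 0;  // zero sign source: zero the lane
      return a;              // positive sign source: pass through
    }

    int main(void) {
      assert(psignd_lane(7, -3) == -7);
      assert(psignd_lane(7, 3) == 7);
      assert(psignd_lane(7, 0) == 0);
      return 0;
    }
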
diff --git a/src/qs8-requantization/q31-neon.c b/src/qs8-requantization/q31-neon.c
new file mode 100644
index 0000000..9756293
--- /dev/null
+++ b/src/qs8-requantization/q31-neon.c
@@ -0,0 +1,125 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <arm_neon.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_q31__neon(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  // Compute requantization parameters.
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
+  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+  assert(multiplier >= INT32_C(0x40000000));
+  assert(multiplier <= INT32_C(0x7FFFFF80));
+
+  // Shift is in [0, 31] range.
+  const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+  assert(shift >= 0);
+  assert(shift < 32);
+
+  const int32x4_t vmultiplier = vdupq_n_s32(multiplier);
+  const int16x8_t vzero_point = vdupq_n_s16((int16_t) zero_point);
+  const int32x4_t vshift = vdupq_n_s32(-shift);
+  const int32x4_t vshift_eq_0_mask = vreinterpretq_s32_u32(vceqq_s32(vshift, vmovq_n_s32(0)));
+  const int8x16_t vqmin = vdupq_n_s8(qmin);
+  const int8x16_t vqmax = vdupq_n_s8(qmax);
+  for (; n != 0; n -= 16) {
+    const int32x4_t x = vld1q_s32(input);
+    const int32x4_t y = vld1q_s32(input + 4);
+    const int32x4_t z = vld1q_s32(input + 8);
+    const int32x4_t w = vld1q_s32(input + 12);
+    input += 16;
+
+    // Directly use the VQRDMULH/SQRDMULH instruction for Q31 multiplication with rounding.
+    // Although this instruction saturates out-of-range outputs, we never hit this case in requantization.
+    const int32x4_t x_product = vqrdmulhq_s32(x, vmultiplier);
+    const int32x4_t y_product = vqrdmulhq_s32(y, vmultiplier);
+    const int32x4_t z_product = vqrdmulhq_s32(z, vmultiplier);
+    const int32x4_t w_product = vqrdmulhq_s32(w, vmultiplier);
+
+    // Shift the 32-bit product right with rounding.
+    // Rounding is performed towards closest integer, with midpoints rounded away from zero.
+    //
+    // We leverage the "right shift with rounding" instruction (VRSHL.S32 on ARM NEON, SRSHL in ARM64 Advanced SIMD) to
+    // do the shift. However, as this instruction rounds midpoints up, rather than away from zero, we adjust the
+    // products by subtracting 1 from the products of negative inputs, but only if shift is non-zero.
+    const int32x4_t x_adjusted_product = vsraq_n_s32(x_product, vbicq_s32(x, vshift_eq_0_mask), 31);
+    const int32x4_t y_adjusted_product = vsraq_n_s32(y_product, vbicq_s32(y, vshift_eq_0_mask), 31);
+    const int32x4_t z_adjusted_product = vsraq_n_s32(z_product, vbicq_s32(z, vshift_eq_0_mask), 31);
+    const int32x4_t w_adjusted_product = vsraq_n_s32(w_product, vbicq_s32(w, vshift_eq_0_mask), 31);
+
+    const int32x4_t x_scaled = vrshlq_s32(x_adjusted_product, vshift);
+    const int32x4_t y_scaled = vrshlq_s32(y_adjusted_product, vshift);
+    const int32x4_t z_scaled = vrshlq_s32(z_adjusted_product, vshift);
+    const int32x4_t w_scaled = vrshlq_s32(w_adjusted_product, vshift);
+
+#ifdef __aarch64__
+    const int16x8_t xy_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(x_scaled), y_scaled), vzero_point);
+    const int16x8_t zw_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(z_scaled), w_scaled), vzero_point);
+    const int8x16_t xyzw_packed = vqmovn_high_s16(vqmovn_s16(xy_packed), zw_packed);
+#else
+    const int16x8_t xy_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(x_scaled), vqmovn_s32(y_scaled)), vzero_point);
+    const int16x8_t zw_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(z_scaled), vqmovn_s32(w_scaled)), vzero_point);
+    const int8x16_t xyzw_packed = vcombine_s8(vqmovn_s16(xy_packed), vqmovn_s16(zw_packed));
+#endif
+
+    const int8x16_t xyzw_clamped = vmaxq_s8(vminq_s8(xyzw_packed, vqmax), vqmin);
+
+    // AArch32 version:
+    //   4x VQRDMULH.S32 Qd, Qm, Qn
+    //   4x VBIC Qd, Qm, Qn
+    //   4x VSRA.S32 Qd, Qm, #31
+    //   4x VRSHL.S32 Qd, Qm, Qn
+    //   4x VQMOVN.S32 Dd, Qm
+    //   2x VQADD.S16 Qd, Qm, Qn
+    //   2x VQMOVN.S16 Dd, Qm
+    //   1x VMAX.S8 Qd, Qm, Qn
+    //   1x VMIN.S8 Qd, Qm, Qn
+    // ---------------------
+    // 26 instructions total
+    //
+    // AArch64 version:
+    //   4x SQRDMULH Vd.4S, Vn.4S, Vm.4S
+    //   4x BIC Vd.16B, Vn.16B, Vm.16B
+    //   4x SSRA Vd.4S, Vn.4S, #31
+    //   4x SRSHL Vd.4S, Vn.4S, Vm.4S
+    //   2x SQXTN Vd.4H, Vn.4S
+    //   2x SQXTN2 Vd.8H, Vn.4S
+    //   2x SQADD Vd.8H, Vn.8H, Vm.8H
+    //   1x SQXTN Vd.8B, Vn.8H
+    //   1x SQXTN2 Vd.16B, Vn.8H
+    //   1x SMIN Vd.16B, Vn.16B, Vm.16B
+    //   1x SMAX Vd.16B, Vn.16B, Vm.16B
+    // ---------------------
+    // 26 instructions total
+
+    vst1q_s8(output, xyzw_clamped);
+    output += 16;
+  }
+}
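
The sign adjustment is easiest to see on a concrete midpoint. A scalar model of VRSHL.S32/SRSHL with a negative shift amount (a sketch assuming shift >= 1 and arithmetic right shift of int64_t, which holds on mainstream compilers):

    #include <assert.h>
    #include <stdint.h>

    // Rounding right shift as performed by VRSHL.S32/SRSHL: add 2**(shift - 1), then shift right;
    // midpoints round up, i.e. towards +infinity.
    static int32_t neon_rounding_shift(int32_t v, uint32_t shift) {
      return (int32_t) (((int64_t) v + (INT64_C(1) << (shift - 1))) >> shift);
    }

    int main(void) {
      assert(neon_rounding_shift(10, 2) == 3);        // 2.5 -> 3: up matches away-from-zero
      assert(neon_rounding_shift(-10, 2) == -2);      // -2.5 -> -2: up differs from away-from-zero
      assert(neon_rounding_shift(-10 - 1, 2) == -3);  // subtract 1 first: -2.5 -> -3, away from zero
      return 0;
    }
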
diff --git a/src/qs8-requantization/q31-scalar.c b/src/qs8-requantization/q31-scalar.c
new file mode 100644
index 0000000..ce843f0
--- /dev/null
+++ b/src/qs8-requantization/q31-scalar.c
@@ -0,0 +1,130 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/scalar-utils.h>
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_q31__scalar(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 4 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  // Compute requantization parameters.
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
+  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+  assert(multiplier >= INT32_C(0x40000000));
+  assert(multiplier <= INT32_C(0x7FFFFF80));
+
+  // Shift is in [0, 31] range.
+  const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+  assert(shift >= 0);
+  assert(shift < 32);
+
+  const int64_t q31rounding = INT64_C(0x40000000);
+  const int32_t remainder_mask = (int32_t)((UINT32_C(1) << shift) - UINT32_C(1));
+  const int32_t threshold = (int32_t)((uint32_t) remainder_mask >> 1);
+  const int32_t smin = (int32_t) qmin - (int32_t) zero_point;
+  const int32_t smax = (int32_t) qmax - (int32_t) zero_point;
+  for (; n != 0; n -= 4) {
+    const int32_t x = input[0];
+    const int32_t y = input[1];
+    const int32_t z = input[2];
+    const int32_t w = input[3];
+    input += 4;
+
+    // Compute full 64-bit product of signed 32-bit factors.
+    //
+    // Note: multiplier can be treated as either signed or unsigned.
+    const int64_t x_product = (int64_t) x * (int64_t) multiplier;
+    const int64_t y_product = (int64_t) y * (int64_t) multiplier;
+    const int64_t z_product = (int64_t) z * (int64_t) multiplier;
+    const int64_t w_product = (int64_t) w * (int64_t) multiplier;
+
+    // Get the Q31 multiplication result by extracting bits 31-62 of the product, with rounding up.
+    // Add rounding value (0x40000000) and then shift right by 31 bits and extract the low 32-bit word.
+    // Note: casts to unsigned types are needed to avoid undefined behavior.
+    // Given the multiplier range, the result of Q31 multiplication is in [-2147483520, 2147483519] range.
+    const int32_t x_q31product = (int32_t) (uint32_t) ((uint64_t) (x_product + q31rounding) >> 31);
+    const int32_t y_q31product = (int32_t) (uint32_t) ((uint64_t) (y_product + q31rounding) >> 31);
+    const int32_t z_q31product = (int32_t) (uint32_t) ((uint64_t) (z_product + q31rounding) >> 31);
+    const int32_t w_q31product = (int32_t) (uint32_t) ((uint64_t) (w_product + q31rounding) >> 31);
+
+    // Arithmetically shift the Q31 product right with rounding.
+    // Rounding is performed towards closest integer, with midpoints rounded away from zero.
+    //
+    // Shift with correct rounding could be efficiently implemented by pre-adding rounding constant, but with input in
+    // [-2147483520, 2147483519] range and rounding constant up to 2**30 we can't rule out overflow. This limitation
+    // leaves us with 3 options:
+    // 1. Extend input to 64-bit signed integer, add a sign-aware rounding constant (2**(shift - 1), minus 1 for
+    //    negative inputs) and shift on 64-bit integers, then truncate result to 32 bits.
+    // 2. Detect overflow and handle this situation separately. Note that overflow is possible only when input is
+    //    positive, and even when addition of a rounding constant overflows 32-bit signed integer, it still doesn't
+    //    overflow 32-bit unsigned integer. Thus, in case of signed overflow, we can compute the result using unsigned
+    //    arithmetic, specifically using logical shift right instead of arithmetic shift right.
+    // 3. Perform arithmetic shift as is, which produces the division result rounded down. Then compute remainder of
+    //    this division by a power of 2, and adjust the result. Result needs adjustment (increment by 1) when
+    //     - input is positive, shift is non-zero, and remainder >= 2**(shift - 1), e.g. 10 >> 2 needs adjustment
+    //     - input is negative, shift is non-zero, and remainder > 2**(shift - 1), e.g. -10 >> 2 doesn't need adjustment
+    //    These conditions can be generalized as
+    //        remainder + (input >= 0) > 2**(shift - 1)
+    //    or equivalently
+    //        remainder - (input < 0) > ((2**shift - 1) >> 1)
+    //    When shift is 0, remainder is 0 as well, the last condition is always false, and no adjustment is done.
+    //
+    // Among these options, option 3 is the most performant across the board, although option 1 is promising for 64-bit
+    // instruction sets.
+    const int32_t x_remainder = (x_q31product & remainder_mask) - (int32_t) (x_q31product < 0);
+    const int32_t y_remainder = (y_q31product & remainder_mask) - (int32_t) (y_q31product < 0);
+    const int32_t z_remainder = (z_q31product & remainder_mask) - (int32_t) (z_q31product < 0);
+    const int32_t w_remainder = (w_q31product & remainder_mask) - (int32_t) (w_q31product < 0);
+
+    const int32_t x_scaled = asr_s32(x_q31product, shift) + (int32_t) (x_remainder > threshold);
+    const int32_t y_scaled = asr_s32(y_q31product, shift) + (int32_t) (y_remainder > threshold);
+    const int32_t z_scaled = asr_s32(z_q31product, shift) + (int32_t) (z_remainder > threshold);
+    const int32_t w_scaled = asr_s32(w_q31product, shift) + (int32_t) (w_remainder > threshold);
+
+    // Clamp the scaled value between (qmin - zero point) and (qmax - zero point).
+    const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
+    const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
+    const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
+    const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
+
+    // Add zero point to clamped value.
+    // The result is guaranteed to be in [qmin, qmax] range.
+    //
+    // This addition can be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
+    // range, so addition of zero point (which is in [-128, 127] range) can not overflow signed 32-bit integer.
+    const int32_t x_biased = x_clamped + zero_point;
+    const int32_t y_biased = y_clamped + zero_point;
+    const int32_t z_biased = z_clamped + zero_point;
+    const int32_t w_biased = w_clamped + zero_point;
+
+    output[0] = (int8_t) x_biased;
+    output[1] = (int8_t) y_biased;
+    output[2] = (int8_t) z_biased;
+    output[3] = (int8_t) w_biased;
+    output += 4;
+  }
+}
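
A quick self-check (illustrative only, separate from the unit tests) that the option-3 remainder adjustment matches a direct round-to-nearest reference with midpoints away from zero:

    #include <assert.h>
    #include <stdint.h>

    // Portable arithmetic right shift: plain >> on negative operands is implementation-defined in C.
    static int64_t asr_ref(int64_t v, uint32_t shift) {
      return v >= 0 ? v >> shift : ~(~v >> shift);
    }

    int main(void) {
      for (uint32_t shift = 1; shift < 8; shift++) {
        const int32_t remainder_mask = (int32_t) ((UINT32_C(1) << shift) - UINT32_C(1));
        const int32_t threshold = (int32_t) ((uint32_t) remainder_mask >> 1);
        for (int32_t v = -1000; v <= 1000; v++) {
          // Reference: sign-aware rounding constant, then arithmetic shift.
          const int64_t rounding = INT64_C(1) << (shift - 1);
          const int32_t expected = (int32_t) asr_ref(v + (v >= 0 ? rounding : rounding - 1), shift);
          // Option 3: truncating shift plus remainder-based increment.
          const int32_t remainder = (v & remainder_mask) - (int32_t) (v < 0);
          const int32_t actual = (int32_t) asr_ref(v, shift) + (int32_t) (remainder > threshold);
          assert(actual == expected);
        }
      }
      return 0;
    }
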
diff --git a/src/qs8-requantization/q31-sse2.c b/src/qs8-requantization/q31-sse2.c
new file mode 100644
index 0000000..c5a8500
--- /dev/null
+++ b/src/qs8-requantization/q31-sse2.c
@@ -0,0 +1,188 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <emmintrin.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_q31__sse2(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  // Compute requantization parameters.
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
+  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+  assert(multiplier >= INT32_C(0x40000000));
+  assert(multiplier <= INT32_C(0x7FFFFF80));
+
+  // Shift is in [0, 31] range.
+  const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+  assert(shift >= 0);
+  assert(shift < 32);
+
+  const __m128i vmultiplier = _mm_set1_epi32(multiplier);
+  const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+  const __m128i vqmin = _mm_set1_epi16((short) qmin);
+  const __m128i vqmax = _mm_set1_epi16((short) qmax);
+  const __m128i vshift = _mm_cvtsi32_si128((int) shift);
+  const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+  const __m128i vremainder_mask = _mm_set1_epi32((int) remainder_mask);
+  const __m128i vthreshold = _mm_set1_epi32((int) (remainder_mask >> 1));
+  const __m128i vq31rounding = _mm_set1_epi64x(UINT64_C(0x40000000));
+  for (; n != 0; n -= 16) {
+    const __m128i x = _mm_loadu_si128((const __m128i*) input);
+    const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+    const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+    const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+    input += 16;
+
+    const __m128i x_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), x);
+    const __m128i y_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), y);
+    const __m128i z_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), z);
+    const __m128i w_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), w);
+
+    const __m128i x_abs = _mm_sub_epi32(_mm_xor_si128(x, x_neg_mask), x_neg_mask);
+    const __m128i y_abs = _mm_sub_epi32(_mm_xor_si128(y, y_neg_mask), y_neg_mask);
+    const __m128i z_abs = _mm_sub_epi32(_mm_xor_si128(z, z_neg_mask), z_neg_mask);
+    const __m128i w_abs = _mm_sub_epi32(_mm_xor_si128(w, w_neg_mask), w_neg_mask);
+
+    const __m128i x_abs_rev = _mm_shuffle_epi32(x_abs, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i y_abs_rev = _mm_shuffle_epi32(y_abs, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i z_abs_rev = _mm_shuffle_epi32(z_abs, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i w_abs_rev = _mm_shuffle_epi32(w_abs, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m128i x_abs_product_even = _mm_mul_epu32(x_abs, vmultiplier);
+    const __m128i y_abs_product_even = _mm_mul_epu32(y_abs, vmultiplier);
+    const __m128i z_abs_product_even = _mm_mul_epu32(z_abs, vmultiplier);
+    const __m128i w_abs_product_even = _mm_mul_epu32(w_abs, vmultiplier);
+
+    const __m128i x_neg_mask_even = _mm_shuffle_epi32(x_neg_mask, _MM_SHUFFLE(2, 2, 0, 0));
+    const __m128i y_neg_mask_even = _mm_shuffle_epi32(y_neg_mask, _MM_SHUFFLE(2, 2, 0, 0));
+    const __m128i z_neg_mask_even = _mm_shuffle_epi32(z_neg_mask, _MM_SHUFFLE(2, 2, 0, 0));
+    const __m128i w_neg_mask_even = _mm_shuffle_epi32(w_neg_mask, _MM_SHUFFLE(2, 2, 0, 0));
+
+    const __m128i x_product_even = _mm_sub_epi64(_mm_xor_si128(x_abs_product_even, x_neg_mask_even), x_neg_mask_even);
+    const __m128i y_product_even = _mm_sub_epi64(_mm_xor_si128(y_abs_product_even, y_neg_mask_even), y_neg_mask_even);
+    const __m128i z_product_even = _mm_sub_epi64(_mm_xor_si128(z_abs_product_even, z_neg_mask_even), z_neg_mask_even);
+    const __m128i w_product_even = _mm_sub_epi64(_mm_xor_si128(w_abs_product_even, w_neg_mask_even), w_neg_mask_even);
+
+    const __m128i x_rounded_product_even = _mm_add_epi64(x_product_even, vq31rounding);
+    const __m128i y_rounded_product_even = _mm_add_epi64(y_product_even, vq31rounding);
+    const __m128i z_rounded_product_even = _mm_add_epi64(z_product_even, vq31rounding);
+    const __m128i w_rounded_product_even = _mm_add_epi64(w_product_even, vq31rounding);
+
+    const __m128i x_abs_product_odd = _mm_mul_epu32(x_abs_rev, vmultiplier);
+    const __m128i y_abs_product_odd = _mm_mul_epu32(y_abs_rev, vmultiplier);
+    const __m128i z_abs_product_odd = _mm_mul_epu32(z_abs_rev, vmultiplier);
+    const __m128i w_abs_product_odd = _mm_mul_epu32(w_abs_rev, vmultiplier);
+
+    const __m128i x_neg_mask_odd = _mm_shuffle_epi32(x_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128i y_neg_mask_odd = _mm_shuffle_epi32(y_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128i z_neg_mask_odd = _mm_shuffle_epi32(z_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128i w_neg_mask_odd = _mm_shuffle_epi32(w_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+
+    const __m128i x_product_odd = _mm_sub_epi64(_mm_xor_si128(x_abs_product_odd, x_neg_mask_odd), x_neg_mask_odd);
+    const __m128i y_product_odd = _mm_sub_epi64(_mm_xor_si128(y_abs_product_odd, y_neg_mask_odd), y_neg_mask_odd);
+    const __m128i z_product_odd = _mm_sub_epi64(_mm_xor_si128(z_abs_product_odd, z_neg_mask_odd), z_neg_mask_odd);
+    const __m128i w_product_odd = _mm_sub_epi64(_mm_xor_si128(w_abs_product_odd, w_neg_mask_odd), w_neg_mask_odd);
+
+    const __m128i x_rounded_product_odd = _mm_add_epi64(x_product_odd, vq31rounding);
+    const __m128i y_rounded_product_odd = _mm_add_epi64(y_product_odd, vq31rounding);
+    const __m128i z_rounded_product_odd = _mm_add_epi64(z_product_odd, vq31rounding);
+    const __m128i w_rounded_product_odd = _mm_add_epi64(w_product_odd, vq31rounding);
+
+    const __m128i x_q31product_even = _mm_srli_epi64(x_rounded_product_even, 31);
+    const __m128i x_q31product_odd = _mm_srli_epi64(x_rounded_product_odd, 31);
+    const __m128i y_q31product_even = _mm_srli_epi64(y_rounded_product_even, 31);
+    const __m128i y_q31product_odd = _mm_srli_epi64(y_rounded_product_odd, 31);
+    const __m128i z_q31product_even = _mm_srli_epi64(z_rounded_product_even, 31);
+    const __m128i z_q31product_odd = _mm_srli_epi64(z_rounded_product_odd, 31);
+    const __m128i w_q31product_even = _mm_srli_epi64(w_rounded_product_even, 31);
+    const __m128i w_q31product_odd = _mm_srli_epi64(w_rounded_product_odd, 31);
+
+    const __m128i x_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps(
+        _mm_castsi128_ps(x_q31product_even), _mm_castsi128_ps(x_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i y_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps(
+        _mm_castsi128_ps(y_q31product_even), _mm_castsi128_ps(y_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i z_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps(
+        _mm_castsi128_ps(z_q31product_even), _mm_castsi128_ps(z_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i w_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps(
+        _mm_castsi128_ps(w_q31product_even), _mm_castsi128_ps(w_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+
+    const __m128i x_q31product = _mm_shuffle_epi32(x_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i y_q31product = _mm_shuffle_epi32(y_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i z_q31product = _mm_shuffle_epi32(z_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i w_q31product = _mm_shuffle_epi32(w_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0));
+
+    const __m128i x_remainder =
+        _mm_add_epi32(_mm_and_si128(x_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), x_q31product));
+    const __m128i y_remainder =
+        _mm_add_epi32(_mm_and_si128(y_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), y_q31product));
+    const __m128i z_remainder =
+        _mm_add_epi32(_mm_and_si128(z_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), z_q31product));
+    const __m128i w_remainder =
+        _mm_add_epi32(_mm_and_si128(w_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), w_q31product));
+
+    const __m128i x_scaled =
+        _mm_sub_epi32(_mm_sra_epi32(x_q31product, vshift), _mm_cmpgt_epi32(x_remainder, vthreshold));
+    const __m128i y_scaled =
+        _mm_sub_epi32(_mm_sra_epi32(y_q31product, vshift), _mm_cmpgt_epi32(y_remainder, vthreshold));
+    const __m128i z_scaled =
+        _mm_sub_epi32(_mm_sra_epi32(z_q31product, vshift), _mm_cmpgt_epi32(z_remainder, vthreshold));
+    const __m128i w_scaled =
+        _mm_sub_epi32(_mm_sra_epi32(w_q31product, vshift), _mm_cmpgt_epi32(w_remainder, vthreshold));
+
+    const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point);
+    const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point);
+    const __m128i xy_clamped = _mm_max_epi16(_mm_min_epi16(xy_packed, vqmax), vqmin);
+    const __m128i zw_clamped = _mm_max_epi16(_mm_min_epi16(zw_packed, vqmax), vqmin);
+    const __m128i xyzw_clamped = _mm_packs_epi16(xy_clamped, zw_clamped);
+
+    // 16x PSHUFD
+    // 4x SHUFPS
+    // 8x PMULUDQ
+    // 8x PXOR (setzero)
+    // 12x PXOR
+    // 4x PAND
+    // 8x PADDQ
+    // 4x PADDD
+    // 8x PSUBQ
+    // 8x PSUBD
+    // 8x PSRLQ (immediate)
+    // 4x PSRAD (register)
+    // 12x PCMPGTD
+    // 2x PACKSSDW
+    // 2x PADDSW
+    // 2x PMAXSW
+    // 2x PMINSW
+    // 1x PACKSSWB
+    // ---------------------
+    // 113 instructions total
+
+    _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+    output += 16;
+  }
+}
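
SSE2 has no signed 32x32->64 multiply (PMULDQ is SSE4.1), which is why this kernel multiplies absolute values with PMULUDQ and then restores the sign across the whole 64-bit lane with the same XOR/SUB pattern. One lane in scalar form (a sketch; the multiplier is always positive, so the product takes the sign of the input):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      const uint64_t multiplier = UINT64_C(0x40000000);
      for (int32_t x = -100; x <= 100; x++) {
        const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x;
        const uint64_t abs_product = (uint64_t) x_abs * multiplier;              // PMULUDQ
        const uint64_t neg_mask = (x < 0) ? ~UINT64_C(0) : UINT64_C(0);          // widened PCMPGTD mask
        const int64_t product = (int64_t) ((abs_product ^ neg_mask) - neg_mask); // PXOR + PSUBQ
        assert(product == (int64_t) x * (int64_t) multiplier);
      }
      return 0;
    }
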
diff --git a/src/qs8-requantization/q31-sse4.c b/src/qs8-requantization/q31-sse4.c
new file mode 100644
index 0000000..11dd400
--- /dev/null
+++ b/src/qs8-requantization/q31-sse4.c
@@ -0,0 +1,136 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <smmintrin.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_q31__sse4(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  // Compute requantization parameters.
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
+  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+  assert(multiplier >= INT32_C(0x40000000));
+  assert(multiplier <= INT32_C(0x7FFFFF80));
+
+  // Shift is in [0, 31] range.
+  const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+  assert(shift >= 0);
+  assert(shift < 32);
+
+  const __m128i vmultiplier = _mm_set1_epi32(multiplier);
+  const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+  const __m128i vqmin = _mm_set1_epi8((char) qmin);
+  const __m128i vqmax = _mm_set1_epi8((char) qmax);
+  const __m128i vshift = _mm_cvtsi32_si128((int) shift);
+  const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+  const __m128i vremainder_mask = _mm_set1_epi32((int) remainder_mask);
+  const __m128i vthreshold = _mm_set1_epi32((int) (remainder_mask >> 1));
+  const __m128i vq31rounding = _mm_set1_epi64x(UINT64_C(0x40000000));
+  for (; n != 0; n -= 16) {
+    const __m128i x = _mm_loadu_si128((const __m128i*) input);
+    const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+    const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+    const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+    input += 16;
+
+    const __m128i x_rev = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i y_rev = _mm_shuffle_epi32(y, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i z_rev = _mm_shuffle_epi32(z, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i w_rev = _mm_shuffle_epi32(w, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m128i x_product_even = _mm_add_epi64(_mm_mul_epi32(x, vmultiplier), vq31rounding);
+    const __m128i y_product_even = _mm_add_epi64(_mm_mul_epi32(y, vmultiplier), vq31rounding);
+    const __m128i z_product_even = _mm_add_epi64(_mm_mul_epi32(z, vmultiplier), vq31rounding);
+    const __m128i w_product_even = _mm_add_epi64(_mm_mul_epi32(w, vmultiplier), vq31rounding);
+
+    const __m128i x_product_odd = _mm_add_epi64(_mm_mul_epi32(x_rev, vmultiplier), vq31rounding);
+    const __m128i y_product_odd = _mm_add_epi64(_mm_mul_epi32(y_rev, vmultiplier), vq31rounding);
+    const __m128i z_product_odd = _mm_add_epi64(_mm_mul_epi32(z_rev, vmultiplier), vq31rounding);
+    const __m128i w_product_odd = _mm_add_epi64(_mm_mul_epi32(w_rev, vmultiplier), vq31rounding);
+
+    const __m128i x_q31product_even = _mm_srli_epi64(x_product_even, 31);
+    const __m128i x_q31product_odd = _mm_add_epi64(x_product_odd, x_product_odd);
+    const __m128i y_q31product_even = _mm_srli_epi64(y_product_even, 31);
+    const __m128i y_q31product_odd = _mm_add_epi64(y_product_odd, y_product_odd);
+    const __m128i z_q31product_even = _mm_srli_epi64(z_product_even, 31);
+    const __m128i z_q31product_odd = _mm_add_epi64(z_product_odd, z_product_odd);
+    const __m128i w_q31product_even = _mm_srli_epi64(w_product_even, 31);
+    const __m128i w_q31product_odd = _mm_add_epi64(w_product_odd, w_product_odd);
+
+    const __m128i x_q31product = _mm_blend_epi16(x_q31product_even, x_q31product_odd, 0xCC);
+    const __m128i y_q31product = _mm_blend_epi16(y_q31product_even, y_q31product_odd, 0xCC);
+    const __m128i z_q31product = _mm_blend_epi16(z_q31product_even, z_q31product_odd, 0xCC);
+    const __m128i w_q31product = _mm_blend_epi16(w_q31product_even, w_q31product_odd, 0xCC);
+
+    const __m128i x_remainder =
+        _mm_add_epi32(_mm_and_si128(x_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), x_q31product));
+    const __m128i y_remainder =
+        _mm_add_epi32(_mm_and_si128(y_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), y_q31product));
+    const __m128i z_remainder =
+        _mm_add_epi32(_mm_and_si128(z_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), z_q31product));
+    const __m128i w_remainder =
+        _mm_add_epi32(_mm_and_si128(w_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), w_q31product));
+
+    const __m128i x_scaled =
+        _mm_sub_epi32(_mm_sra_epi32(x_q31product, vshift), _mm_cmpgt_epi32(x_remainder, vthreshold));
+    const __m128i y_scaled =
+        _mm_sub_epi32(_mm_sra_epi32(y_q31product, vshift), _mm_cmpgt_epi32(y_remainder, vthreshold));
+    const __m128i z_scaled =
+        _mm_sub_epi32(_mm_sra_epi32(z_q31product, vshift), _mm_cmpgt_epi32(z_remainder, vthreshold));
+    const __m128i w_scaled =
+        _mm_sub_epi32(_mm_sra_epi32(w_q31product, vshift), _mm_cmpgt_epi32(w_remainder, vthreshold));
+
+    const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point);
+    const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point);
+    const __m128i xyzw_packed = _mm_packs_epi16(xy_packed, zw_packed);
+    const __m128i xyzw_clamped = _mm_max_epi8(_mm_min_epi8(xyzw_packed, vqmax), vqmin);
+
+    // 4x PSHUFD
+    // 8x PMULDQ
+    // 12x PADDQ
+    // 4x PADDD
+    // 4x PSUBD
+    // 4x PSRLQ (immediate)
+    // 4x PSRAD (register)
+    // 4x PBLENDW
+    // 4x PAND
+    // 4x PXOR (setzero)
+    // 8x PCMPGTD
+    // 2x PACKSSDW
+    // 2x PADDSW
+    // 1x PACKSSWB
+    // 1x PMAXSB
+    // 1x PMINSB
+    // ---------------------
+    // 67 instructions total
+
+    _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+    output += 16;
+  }
+}
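
The PADDQ doubling of the odd products is worth spelling out: PBLENDW with mask 0xCC keeps 32-bit lanes 1 and 3, i.e. the high word of each 64-bit lane, and the high word of (v + v) is exactly the low 32 bits of v >> 31. Doubling therefore stands in for a second PSRLQ. In scalar form (a sketch; bit 63 of the rounded product is always clear here, since product + rounding < 2**63):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      const uint64_t rounded_product = UINT64_C(0x3456789ABCDEF012);
      const uint32_t via_srlq = (uint32_t) (rounded_product >> 31);                         // even-lane path
      const uint32_t via_doubling = (uint32_t) ((rounded_product + rounded_product) >> 32); // odd-lane path
      assert(via_srlq == via_doubling);
      return 0;
    }
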
diff --git a/src/qs8-requantization/q31-ssse3.c b/src/qs8-requantization/q31-ssse3.c
new file mode 100644
index 0000000..6dad4a9
--- /dev/null
+++ b/src/qs8-requantization/q31-ssse3.c
@@ -0,0 +1,189 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <tmmintrin.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_q31__ssse3(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  // Compute requantization parameters.
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
+  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+  assert(multiplier >= INT32_C(0x40000000));
+  assert(multiplier <= INT32_C(0x7FFFFF80));
+
+  // Shift is in [0, 31] range.
+  const int32_t shift = 127 + 31 - 32 - (scale_bits >> 23);
+  assert(shift >= 0);
+  assert(shift < 32);
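+  // Worked example (hypothetical scale, for illustration only): scale = 0x1.8p-10f has
+  // scale_bits >> 23 == 117, so multiplier == 0x60000000 and shift == 9, and the kernel
+  // computes x * scale as ((x * 0x60000000 + 0x40000000) >> 31) >> 9, up to rounding.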
+
+  const __m128i vmultiplier = _mm_set1_epi32(multiplier);
+  const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+  const __m128i vqmin = _mm_set1_epi16((short) qmin);
+  const __m128i vqmax = _mm_set1_epi16((short) qmax);
+  const __m128i vshift = _mm_cvtsi32_si128((int) shift);
+  const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+  const __m128i vremainder_mask = _mm_set1_epi32((int) remainder_mask);
+  const __m128i vthreshold = _mm_set1_epi32((int) (remainder_mask >> 1));
+  const __m128i vq31rounding = _mm_set1_epi64x(UINT64_C(0x40000000));
+  for (; n != 0; n -= 16) {
+    const __m128i x = _mm_loadu_si128((const __m128i*) input);
+    const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+    const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+    const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+    input += 16;
+
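+    // SSSE3 lacks a signed 32x32->64-bit multiply (PMULDQ is SSE4.1), so the Q31 product
+    // is computed on absolute values with PMULUDQ and the sign is restored afterwards via
+    // the XOR-and-subtract negation idiom.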
+    const __m128i x_abs = _mm_abs_epi32(x);
+    const __m128i y_abs = _mm_abs_epi32(y);
+    const __m128i z_abs = _mm_abs_epi32(z);
+    const __m128i w_abs = _mm_abs_epi32(w);
+
+    const __m128i x_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), x);
+    const __m128i y_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), y);
+    const __m128i z_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), z);
+    const __m128i w_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), w);
+
+    const __m128i x_abs_rev = _mm_shuffle_epi32(x_abs, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i y_abs_rev = _mm_shuffle_epi32(y_abs, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i z_abs_rev = _mm_shuffle_epi32(z_abs, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i w_abs_rev = _mm_shuffle_epi32(w_abs, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m128i x_abs_product_even = _mm_mul_epu32(x_abs, vmultiplier);
+    const __m128i y_abs_product_even = _mm_mul_epu32(y_abs, vmultiplier);
+    const __m128i z_abs_product_even = _mm_mul_epu32(z_abs, vmultiplier);
+    const __m128i w_abs_product_even = _mm_mul_epu32(w_abs, vmultiplier);
+
+    const __m128i x_neg_mask_even = _mm_shuffle_epi32(x_neg_mask, _MM_SHUFFLE(2, 2, 0, 0));
+    const __m128i y_neg_mask_even = _mm_shuffle_epi32(y_neg_mask, _MM_SHUFFLE(2, 2, 0, 0));
+    const __m128i z_neg_mask_even = _mm_shuffle_epi32(z_neg_mask, _MM_SHUFFLE(2, 2, 0, 0));
+    const __m128i w_neg_mask_even = _mm_shuffle_epi32(w_neg_mask, _MM_SHUFFLE(2, 2, 0, 0));
+
+    const __m128i x_product_even = _mm_sub_epi64(_mm_xor_si128(x_abs_product_even, x_neg_mask_even), x_neg_mask_even);
+    const __m128i y_product_even = _mm_sub_epi64(_mm_xor_si128(y_abs_product_even, y_neg_mask_even), y_neg_mask_even);
+    const __m128i z_product_even = _mm_sub_epi64(_mm_xor_si128(z_abs_product_even, z_neg_mask_even), z_neg_mask_even);
+    const __m128i w_product_even = _mm_sub_epi64(_mm_xor_si128(w_abs_product_even, w_neg_mask_even), w_neg_mask_even);
+
+    const __m128i x_rounded_product_even = _mm_add_epi64(x_product_even, vq31rounding);
+    const __m128i y_rounded_product_even = _mm_add_epi64(y_product_even, vq31rounding);
+    const __m128i z_rounded_product_even = _mm_add_epi64(z_product_even, vq31rounding);
+    const __m128i w_rounded_product_even = _mm_add_epi64(w_product_even, vq31rounding);
+
+    const __m128i x_abs_product_odd = _mm_mul_epu32(x_abs_rev, vmultiplier);
+    const __m128i y_abs_product_odd = _mm_mul_epu32(y_abs_rev, vmultiplier);
+    const __m128i z_abs_product_odd = _mm_mul_epu32(z_abs_rev, vmultiplier);
+    const __m128i w_abs_product_odd = _mm_mul_epu32(w_abs_rev, vmultiplier);
+
+    const __m128i x_neg_mask_odd = _mm_shuffle_epi32(x_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128i y_neg_mask_odd = _mm_shuffle_epi32(y_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128i z_neg_mask_odd = _mm_shuffle_epi32(z_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+    const __m128i w_neg_mask_odd = _mm_shuffle_epi32(w_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+
+    const __m128i x_product_odd = _mm_sub_epi64(_mm_xor_si128(x_abs_product_odd, x_neg_mask_odd), x_neg_mask_odd);
+    const __m128i y_product_odd = _mm_sub_epi64(_mm_xor_si128(y_abs_product_odd, y_neg_mask_odd), y_neg_mask_odd);
+    const __m128i z_product_odd = _mm_sub_epi64(_mm_xor_si128(z_abs_product_odd, z_neg_mask_odd), z_neg_mask_odd);
+    const __m128i w_product_odd = _mm_sub_epi64(_mm_xor_si128(w_abs_product_odd, w_neg_mask_odd), w_neg_mask_odd);
+
+    const __m128i x_rounded_product_odd = _mm_add_epi64(x_product_odd, vq31rounding);
+    const __m128i y_rounded_product_odd = _mm_add_epi64(y_product_odd, vq31rounding);
+    const __m128i z_rounded_product_odd = _mm_add_epi64(z_product_odd, vq31rounding);
+    const __m128i w_rounded_product_odd = _mm_add_epi64(w_product_odd, vq31rounding);
+
+    const __m128i x_q31product_even = _mm_srli_epi64(x_rounded_product_even, 31);
+    const __m128i x_q31product_odd = _mm_srli_epi64(x_rounded_product_odd, 31);
+    const __m128i y_q31product_even = _mm_srli_epi64(y_rounded_product_even, 31);
+    const __m128i y_q31product_odd = _mm_srli_epi64(y_rounded_product_odd, 31);
+    const __m128i z_q31product_even = _mm_srli_epi64(z_rounded_product_even, 31);
+    const __m128i z_q31product_odd = _mm_srli_epi64(z_rounded_product_odd, 31);
+    const __m128i w_q31product_even = _mm_srli_epi64(w_rounded_product_even, 31);
+    const __m128i w_q31product_odd = _mm_srli_epi64(w_rounded_product_odd, 31);
+
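+    // Each 64-bit lane now holds a 32-bit Q31 result in its low half: SHUFPS gathers the
+    // low halves into 0-2-1-3 order (even results first, then odd), and the PSHUFD below
+    // restores the natural 0-1-2-3 element order.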
+    const __m128i x_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps(
+        _mm_castsi128_ps(x_q31product_even), _mm_castsi128_ps(x_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i y_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps(
+        _mm_castsi128_ps(y_q31product_even), _mm_castsi128_ps(y_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i z_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps(
+        _mm_castsi128_ps(z_q31product_even), _mm_castsi128_ps(z_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i w_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps(
+        _mm_castsi128_ps(w_q31product_even), _mm_castsi128_ps(w_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+
+    const __m128i x_q31product = _mm_shuffle_epi32(x_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i y_q31product = _mm_shuffle_epi32(y_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i z_q31product = _mm_shuffle_epi32(z_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i w_q31product = _mm_shuffle_epi32(w_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0));
+
+    const __m128i x_remainder =
+        _mm_add_epi32(_mm_and_si128(x_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), x_q31product));
+    const __m128i y_remainder =
+        _mm_add_epi32(_mm_and_si128(y_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), y_q31product));
+    const __m128i z_remainder =
+        _mm_add_epi32(_mm_and_si128(z_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), z_q31product));
+    const __m128i w_remainder =
+        _mm_add_epi32(_mm_and_si128(w_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), w_q31product));
+
+    const __m128i x_scaled =
+        _mm_sub_epi32(_mm_sra_epi32(x_q31product, vshift), _mm_cmpgt_epi32(x_remainder, vthreshold));
+    const __m128i y_scaled =
+        _mm_sub_epi32(_mm_sra_epi32(y_q31product, vshift), _mm_cmpgt_epi32(y_remainder, vthreshold));
+    const __m128i z_scaled =
+        _mm_sub_epi32(_mm_sra_epi32(z_q31product, vshift), _mm_cmpgt_epi32(z_remainder, vthreshold));
+    const __m128i w_scaled =
+        _mm_sub_epi32(_mm_sra_epi32(w_q31product, vshift), _mm_cmpgt_epi32(w_remainder, vthreshold));
+
+    const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point);
+    const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point);
+    const __m128i xy_clamped = _mm_max_epi16(_mm_min_epi16(xy_packed, vqmax), vqmin);
+    const __m128i zw_clamped = _mm_max_epi16(_mm_min_epi16(zw_packed, vqmax), vqmin);
+    const __m128i xyzw_clamped = _mm_packs_epi16(xy_clamped, zw_clamped);
+
+    // 16x PSHUFD
+    // 4x SHUFPS
+    // 8x PMULUDQ
+    // 8x PXOR (setzero)
+    // 8x PXOR
+    // 4x PAND
+    // 8x PADDQ
+    // 4x PADDD
+    // 8x PSUBQ
+    // 4x PSUBD
+    // 8x PSRLQ (immediate)
+    // 4x PSRAD (register)
+    // 12x PCMPGTD
+    // 4x PABSD
+    // 2x PACKSSDW
+    // 2x PADDSW
+    // 2x PMAXSW
+    // 2x PMINSW
+    // 1x PACKSSWB
+    // ---------------------
+    // 109 instructions total
+
+    _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+    output += 16;
+  }
+}
diff --git a/src/xnnpack/requantization-stubs.h b/src/xnnpack/requantization-stubs.h
index bab892b..64f24ae 100644
--- a/src/xnnpack/requantization-stubs.h
+++ b/src/xnnpack/requantization-stubs.h
@@ -17,6 +17,7 @@
 extern "C" {
 #endif
 
+
 typedef void (*xnn_qu8_requantization_function)(
     size_t n,
     const int32_t* input,
@@ -59,11 +60,50 @@
 DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_q31__neon)
 DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_q31__psimd)
 
-DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__scalar)
-DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__sse2)
-DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__ssse3)
-DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__sse4)
-DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__neon)
+
+typedef void (*xnn_qs8_requantization_function)(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output);
+
+#define DECLARE_QS8_REQUANTIZATION_FUNCTION(fn_name) \
+    void fn_name(                                    \
+        size_t n,                                    \
+        const int32_t* input,                        \
+        float scale,                                 \
+        int8_t zero_point,                           \
+        int8_t qmin,                                 \
+        int8_t qmax,                                 \
+        int8_t* output);
+
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__scalar_unsigned32)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__scalar_unsigned64)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__scalar_signed64)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__sse2)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__ssse3)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__sse4)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__neon)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__psimd)
+
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_fp32__scalar_lrintf)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_fp32__scalar_magic)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_fp32__sse2)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_fp32__sse4)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_fp32__neon)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_fp32__psimd)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_fp32__wasmsimd)
+
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_q31__scalar)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_q31__sse2)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_q31__ssse3)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_q31__sse4)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_q31__neon)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_q31__psimd)
+
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h
index ecfbe9a..50c6d1d 100644
--- a/src/xnnpack/requantization.h
+++ b/src/xnnpack/requantization.h
@@ -45,6 +45,102 @@
   return (uint8_t) (n + params.scalar.zero_point);
 }
 
+inline static uint8_t xnn_qu8_requantize_precise(
+  int32_t value,
+  float scale,
+  uint8_t zero_point,
+  uint8_t qmin,
+  uint8_t qmax)
+{
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000);
+  const uint32_t shift = 127 + 23 - (scale_bits >> 23);
+  assert(shift >= 24);
+  assert(shift < 56);
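+  // Worked example (hypothetical scale, for illustration only): scale = 0x1.0p-4f has
+  // scale_bits >> 23 == 123, so multiplier == 0x00800000 and shift == 27, and value / 16
+  // is computed as (|value| * 0x00800000 + (1 << 26)) >> 27 with the sign restored below.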
+
+  // Compute absolute value of input as unsigned 32-bit int.
+  // All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
+  const uint32_t abs_value = (value >= 0) ? (uint32_t) value : -(uint32_t) value;
+
+  // Compute full 64-bit product of 32-bit factors
+  const uint64_t product = (uint64_t) abs_value * (uint64_t) multiplier;
+
+  // Shift the full 64-bit product right with rounding.
+  // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
+  const uint64_t rounding = UINT64_C(1) << (shift - 1);
+  const uint32_t abs_scaled_value = (uint32_t) ((product + rounding) >> shift);
+
+  // Copy the sign of input to scaled absolute input value.
+  const int32_t scaled_value = (int32_t) (value >= 0 ? abs_scaled_value : -abs_scaled_value);
+
+  // Clamp scaled value with zero point between smin and smax.
+  int32_t clamped_value = scaled_value;
+  const int32_t smin = (int32_t) (uint32_t) qmin - (int32_t) (uint32_t) zero_point;
+  if (clamped_value < smin) {
+    clamped_value = smin;
+  }
+  const int32_t smax = (int32_t) (uint32_t) qmax - (int32_t) (uint32_t) zero_point;
+  if (clamped_value > smax) {
+    clamped_value = smax;
+  }
+
+  // Add zero point to clamped value.
+  const int32_t biased_value = clamped_value + (int32_t) (uint32_t) zero_point;
+
+  return biased_value;
+}
+
+inline static int8_t xnn_qs8_requantize_precise(
+  int32_t value,
+  float scale,
+  int8_t zero_point,
+  int8_t qmin,
+  int8_t qmax)
+{
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000);
+  const uint32_t shift = 127 + 23 - (scale_bits >> 23);
+  assert(shift >= 24);
+  assert(shift < 56);
+
+  // Compute absolute value of input as unsigned 32-bit int.
+  // All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
+  const uint32_t abs_value = (value >= 0) ? (uint32_t) value : -(uint32_t) value;
+
+  // Compute full 64-bit product of 32-bit factors
+  const uint64_t product = (uint64_t) abs_value * (uint64_t) multiplier;
+
+  // Shift the full 64-bit product right with rounding.
+  // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
+  const uint64_t rounding = UINT64_C(1) << (shift - 1);
+  const uint32_t abs_scaled_value = (uint32_t) ((product + rounding) >> shift);
+
+  // Copy the sign of input to scaled absolute input value.
+  const int32_t scaled_value = (int32_t) (value >= 0 ? abs_scaled_value : -abs_scaled_value);
+
+  // Clamp scaled value with zero point between smin and smax.
+  int32_t clamped_value = scaled_value;
+  const int32_t smin = (int32_t) qmin - (int32_t) zero_point;
+  if (clamped_value < smin) {
+    clamped_value = smin;
+  }
+  const int32_t smax = (int32_t) qmax - (int32_t) zero_point;
+  if (clamped_value > smax) {
+    clamped_value = smax;
+  }
+
+  // Add zero point to clamped value.
+  const int32_t biased_value = clamped_value + (int32_t) zero_point;
+
+  return biased_value;
+}
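+// For example (hypothetical arguments), xnn_qs8_requantize_precise(-1000, 0x1.0p-4f, 1, -128, 127)
+// scales -1000 by 1/16 to -62.5, rounds the tie away from zero to -63, and adds the zero
+// point of 1 to return -62.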
+
 static inline uint8_t xnn_qu8_quantize_avgpool(
   int32_t n,
   union xnn_qu8_avgpool_params params)
diff --git a/src/xnnpack/scalar-utils.h b/src/xnnpack/scalar-utils.h
index dab3f82..341a4ae 100644
--- a/src/xnnpack/scalar-utils.h
+++ b/src/xnnpack/scalar-utils.h
@@ -68,51 +68,3 @@
     return x >> n;
   #endif
 }
-
-inline static uint8_t scalar_requantize_precise(
-  int32_t value,
-  float scale,
-  uint8_t zero_point,
-  uint8_t qmin,
-  uint8_t qmax)
-{
-  assert(scale < 1.0f);
-  assert(scale >= 0x1.0p-32f);
-
-  const uint32_t scale_bits = fp32_to_bits(scale);
-  const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000);
-  const uint32_t shift = 127 + 23 - (scale_bits >> 23);
-  assert(shift >= 24);
-  assert(shift < 56);
-
-  // Compute absolute value of input as unsigned 32-bit int.
-  // All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
-  const uint32_t abs_value = (value >= 0) ? (uint32_t) value : -(uint32_t) value;
-
-  // Compute full 64-bit product of 32-bit factors
-  const uint64_t product = (uint64_t) abs_value * (uint64_t) multiplier;
-
-  // Shift the full 64-bit product right with rounding.
-  // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
-  const uint64_t rounding = UINT64_C(1) << (shift - 1);
-  const uint32_t abs_scaled_value = (uint32_t) ((product + rounding) >> shift);
-
-  // Copy the sign of input to scaled absolute input value.
-  const int32_t scaled_value = (int32_t) (value >= 0 ? abs_scaled_value : -abs_scaled_value);
-
-  // Clamp scaled value with zero point between smin and smax.
-  int32_t clamped_value = scaled_value;
-  const int32_t smin = (int32_t) (uint32_t) qmin - (int32_t) (uint32_t) zero_point;
-  if (clamped_value < smin) {
-    clamped_value = smin;
-  }
-  const int32_t smax = (int32_t) (uint32_t) qmax - (int32_t) (uint32_t) zero_point;
-  if (clamped_value > smax) {
-    clamped_value = smax;
-  }
-
-  // Add zero point to clamped value.
-  const int32_t biased_value = clamped_value + (int32_t) (uint32_t) zero_point;
-
-  return biased_value;
-}
diff --git a/test/qs8-requantization.cc b/test/qs8-requantization.cc
new file mode 100644
index 0000000..ed90f4e
--- /dev/null
+++ b/test/qs8-requantization.cc
@@ -0,0 +1,1192 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <cmath>
+#include <cstddef>
+#include <cstdlib>
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+
+#include <xnnpack/requantization-stubs.h>
+#include "requantization-tester.h"
+
+
+/*
+ * Precise scalar implementation using unsigned 32-bit arithmetic.
+ */
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED32, exact_divide_by_po2) {
+  for (uint32_t s = 1; s < 32; s++) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .s(s)
+      .TestExactDivideByPO2(xnn_qs8_requantize_precise__scalar_unsigned32);
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED32, exact_divide_by_po2_with_zero_point) {
+  for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+       zero_point <= std::numeric_limits<int8_t>::max();
+       zero_point++)
+  {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .zero_point(zero_point)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestExactDivideByPO2(xnn_qs8_requantize_precise__scalar_unsigned32);
+    }
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_up) {
+  for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+       zero_point <= std::numeric_limits<int8_t>::max();
+       zero_point++)
+  {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .zero_point(zero_point)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__scalar_unsigned32);
+    }
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_down) {
+  for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+       zero_point <= std::numeric_limits<int8_t>::max();
+       zero_point++)
+  {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .zero_point(zero_point)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__scalar_unsigned32);
+    }
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_away) {
+  for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+       zero_point <= std::numeric_limits<int8_t>::max();
+       zero_point++)
+  {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .zero_point(zero_point)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__scalar_unsigned32);
+    }
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED32, special_cases) {
+  RequantizationTester()
+    .qmin(std::numeric_limits<int8_t>::min())
+    .qmax(std::numeric_limits<int8_t>::max())
+    .TestSpecialCases(xnn_qs8_requantize_precise__scalar_unsigned32);
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED32, random_cases) {
+  RequantizationTester()
+    .qmin(std::numeric_limits<int8_t>::min())
+    .qmax(std::numeric_limits<int8_t>::max())
+    .iterations(100)
+    .TestRandomCasesPrecise(xnn_qs8_requantize_precise__scalar_unsigned32);
+}
+
+
+/*
+ * Precise scalar implementation using unsigned 64-bit arithmetic.
+ */
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED64, exact_divide_by_po2) {
+  for (uint32_t s = 1; s < 32; s++) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .s(s)
+      .TestExactDivideByPO2(xnn_qs8_requantize_precise__scalar_unsigned64);
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED64, exact_divide_by_po2_with_zero_point) {
+  for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+       zero_point <= std::numeric_limits<int8_t>::max();
+       zero_point++)
+  {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .zero_point(zero_point)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestExactDivideByPO2(xnn_qs8_requantize_precise__scalar_unsigned64);
+    }
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_up) {
+  for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+       zero_point <= std::numeric_limits<int8_t>::max();
+       zero_point++)
+  {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .zero_point(zero_point)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__scalar_unsigned64);
+    }
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_down) {
+  for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+       zero_point <= std::numeric_limits<int8_t>::max();
+       zero_point++)
+  {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .zero_point(zero_point)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__scalar_unsigned64);
+    }
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_away) {
+  for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+       zero_point <= std::numeric_limits<int8_t>::max();
+       zero_point++)
+  {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .zero_point(zero_point)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__scalar_unsigned64);
+    }
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED64, special_cases) {
+  RequantizationTester()
+    .qmin(std::numeric_limits<int8_t>::min())
+    .qmax(std::numeric_limits<int8_t>::max())
+    .TestSpecialCases(xnn_qs8_requantize_precise__scalar_unsigned64);
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED64, random_cases) {
+  RequantizationTester()
+    .qmin(std::numeric_limits<int8_t>::min())
+    .qmax(std::numeric_limits<int8_t>::max())
+    .iterations(100)
+    .TestRandomCasesPrecise(xnn_qs8_requantize_precise__scalar_unsigned64);
+}
+
+
+/*
+ * Precise scalar implementation using signed 64-bit arithmetic.
+ */
+
+TEST(QS8_PRECISE__SCALAR_SIGNED64, exact_divide_by_po2) {
+  for (uint32_t s = 1; s < 32; s++) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .s(s)
+      .TestExactDivideByPO2(xnn_qs8_requantize_precise__scalar_signed64);
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_SIGNED64, exact_divide_by_po2_with_zero_point) {
+  for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+       zero_point <= std::numeric_limits<int8_t>::max();
+       zero_point++)
+  {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .zero_point(zero_point)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestExactDivideByPO2(xnn_qs8_requantize_precise__scalar_signed64);
+    }
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_SIGNED64, divide_by_po2_with_rounding_up) {
+  for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+       zero_point <= std::numeric_limits<int8_t>::max();
+       zero_point++)
+  {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .zero_point(zero_point)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__scalar_signed64);
+    }
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_SIGNED64, divide_by_po2_with_rounding_down) {
+  for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+       zero_point <= std::numeric_limits<int8_t>::max();
+       zero_point++)
+  {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .zero_point(zero_point)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__scalar_signed64);
+    }
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_SIGNED64, divide_by_po2_with_rounding_away) {
+  for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+       zero_point <= std::numeric_limits<int8_t>::max();
+       zero_point++)
+  {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .zero_point(zero_point)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__scalar_signed64);
+    }
+  }
+}
+
+TEST(QS8_PRECISE__SCALAR_SIGNED64, special_cases) {
+  RequantizationTester()
+    .qmin(std::numeric_limits<int8_t>::min())
+    .qmax(std::numeric_limits<int8_t>::max())
+    .TestSpecialCases(xnn_qs8_requantize_precise__scalar_signed64);
+}
+
+TEST(QS8_PRECISE__SCALAR_SIGNED64, random_cases) {
+  RequantizationTester()
+    .qmin(std::numeric_limits<int8_t>::min())
+    .qmax(std::numeric_limits<int8_t>::max())
+    .iterations(100)
+    .TestRandomCasesPrecise(xnn_qs8_requantize_precise__scalar_signed64);
+}
+
+
+/*
+ * FP32-based scalar implementation using lrintf function.
+ */
+
+TEST(QS8_FP32__SCALAR_LRINTF, random_cases) {
+  RequantizationTester()
+    .qmin(std::numeric_limits<int8_t>::min())
+    .qmax(std::numeric_limits<int8_t>::max())
+    .iterations(1000)
+    .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__scalar_lrintf);
+}
+
+
+/*
+ * FP32-based scalar implementation using the magic trick for FP32->INT32 conversion.
+ */
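+
+// A minimal sketch of the "magic" conversion exercised here (assumed form, not quoted from
+// the kernel): adding 0x1.8p23f rounds the scaled value to nearest inside the float
+// mantissa, and subtracting the magic constant's bit pattern recovers the integer:
+//
+//   const float f = x * scale + 12582912.0f;                            // 12582912.0f == 0x1.8p23f
+//   const int32_t n = (int32_t) fp32_to_bits(f) - INT32_C(0x4B400000);  // bits of 0x1.8p23f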
+
+TEST(QS8_FP32__SCALAR_MAGIC, random_cases) {
+  RequantizationTester()
+    .qmin(std::numeric_limits<int8_t>::min())
+    .qmax(std::numeric_limits<int8_t>::max())
+    .iterations(1000)
+    .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__scalar_magic);
+}
+
+
+/*
+ * Q31-based scalar implementation.
+ */
+
+TEST(QS8_Q31__SCALAR, exact_divide_by_po2) {
+  for (uint32_t s = 1; s < 32; s++) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .s(s)
+      .TestExactDivideByPO2(xnn_qs8_requantize_q31__scalar);
+  }
+}
+
+TEST(QS8_Q31__SCALAR, exact_divide_by_po2_with_zero_point) {
+  for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+       zero_point <= std::numeric_limits<int8_t>::max();
+       zero_point++)
+  {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .zero_point(zero_point)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestExactDivideByPO2(xnn_qs8_requantize_q31__scalar);
+    }
+  }
+}
+
+TEST(QS8_Q31__SCALAR, divide_by_po2_with_rounding_up) {
+  for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+       zero_point <= std::numeric_limits<int8_t>::max();
+       zero_point++)
+  {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .zero_point(zero_point)
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_q31__scalar);
+    }
+  }
+}
+
+/* No rounding down test - it fails because of upward bias in multiplication */
+/* No rounding away test - it fails because of upward bias in multiplication */
+
+TEST(QS8_Q31__SCALAR, special_cases) {
+  RequantizationTester()
+    .qmin(std::numeric_limits<int8_t>::min())
+    .qmax(std::numeric_limits<int8_t>::max())
+    .TestSpecialCases(xnn_qs8_requantize_q31__scalar);
+}
+
+TEST(QS8_Q31__SCALAR, random_cases) {
+  RequantizationTester()
+    .qmin(std::numeric_limits<int8_t>::min())
+    .qmax(std::numeric_limits<int8_t>::max())
+    .iterations(100)
+    .TestRandomCasesApproximate(xnn_qs8_requantize_q31__scalar);
+}
+
+
+#if !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
+  /*
+   * Precise PSIMD implementation using unsigned 32-bit arithmetic.
+   */
+
+  TEST(QS8_PRECISE__PSIMD, exact_divide_by_po2) {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestExactDivideByPO2(xnn_qs8_requantize_precise__psimd);
+    }
+  }
+
+  TEST(QS8_PRECISE__PSIMD, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestExactDivideByPO2(xnn_qs8_requantize_precise__psimd);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__PSIMD, divide_by_po2_with_rounding_up) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__psimd);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__PSIMD, divide_by_po2_with_rounding_down) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__psimd);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__PSIMD, divide_by_po2_with_rounding_away) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__psimd);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__PSIMD, special_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .TestSpecialCases(xnn_qs8_requantize_precise__psimd);
+  }
+
+  TEST(QS8_PRECISE__PSIMD, random_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .iterations(100)
+      .TestRandomCasesPrecise(xnn_qs8_requantize_precise__psimd);
+  }
+
+
+  /*
+   * FP32-based PSIMD implementation using the magic trick for FP32->INT32 conversion.
+   */
+
+  TEST(QS8_FP32__PSIMD, random_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .iterations(1000)
+      .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__psimd);
+  }
+#endif  // !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+  /*
+   * Precise SSE2 implementation using floating-point shuffle.
+   */
+
+  TEST(QS8_PRECISE__SSE2, exact_divide_by_po2) {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestExactDivideByPO2(xnn_qs8_requantize_precise__sse2);
+    }
+  }
+
+  TEST(QS8_PRECISE__SSE2, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestExactDivideByPO2(xnn_qs8_requantize_precise__sse2);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__SSE2, divide_by_po2_with_rounding_up) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__sse2);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__SSE2, divide_by_po2_with_rounding_down) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__sse2);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__SSE2, divide_by_po2_with_rounding_away) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__sse2);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__SSE2, special_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .TestSpecialCases(xnn_qs8_requantize_precise__sse2);
+  }
+
+  TEST(QS8_PRECISE__SSE2, random_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .iterations(100)
+      .TestRandomCasesPrecise(xnn_qs8_requantize_precise__sse2);
+  }
+
+
+  /*
+   * Precise SSSE3 implementation using floating-point shuffle.
+   */
+
+  TEST(QS8_PRECISE__SSSE3, exact_divide_by_po2) {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestExactDivideByPO2(xnn_qs8_requantize_precise__ssse3);
+    }
+  }
+
+  TEST(QS8_PRECISE__SSSE3, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestExactDivideByPO2(xnn_qs8_requantize_precise__ssse3);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__SSSE3, divide_by_po2_with_rounding_up) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__ssse3);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__SSSE3, divide_by_po2_with_rounding_down) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__ssse3);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__SSSE3, divide_by_po2_with_rounding_away) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__ssse3);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__SSSE3, special_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .TestSpecialCases(xnn_qs8_requantize_precise__ssse3);
+  }
+
+  TEST(QS8_PRECISE__SSSE3, random_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .iterations(100)
+      .TestRandomCasesPrecise(xnn_qs8_requantize_precise__ssse3);
+  }
+
+
+  /*
+   * Precise SSE4.1 implementation using the static blend instruction.
+   */
+
+  TEST(QS8_PRECISE__SSE4, exact_divide_by_po2) {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestExactDivideByPO2(xnn_qs8_requantize_precise__sse4);
+    }
+  }
+
+  TEST(QS8_PRECISE__SSE4, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestExactDivideByPO2(xnn_qs8_requantize_precise__sse4);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__SSE4, divide_by_po2_with_rounding_up) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__sse4);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__SSE4, divide_by_po2_with_rounding_down) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__sse4);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__SSE4, divide_by_po2_with_rounding_away) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__sse4);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__SSE4, special_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .TestSpecialCases(xnn_qs8_requantize_precise__sse4);
+  }
+
+  TEST(QS8_PRECISE__SSE4, random_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .iterations(100)
+      .TestRandomCasesPrecise(xnn_qs8_requantize_precise__sse4);
+  }
+
+
+  /*
+   * FP32-based x86 SSE2 implementation.
+   */
+
+  TEST(QS8_FP32__SSE2, random_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .iterations(1000)
+      .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__sse2);
+  }
+
+
+  /*
+   * FP32-based x86 SSE4 implementation.
+   */
+
+  TEST(QS8_FP32__SSE4, random_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .iterations(1000)
+      .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__sse4);
+  }
+
+
+  /*
+   * Q31-based x86 SSE2 implementation.
+   */
+
+  TEST(QS8_Q31__SSE2, exact_divide_by_po2) {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestExactDivideByPO2(xnn_qs8_requantize_q31__sse2);
+    }
+  }
+
+  TEST(QS8_Q31__SSE2, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestExactDivideByPO2(xnn_qs8_requantize_q31__sse2);
+      }
+    }
+  }
+
+  TEST(QS8_Q31__SSE2, divide_by_po2_with_rounding_up) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_q31__sse2);
+      }
+    }
+  }
+
+  /* No rounding down test - it fails because of upward bias in multiplication */
+  /* No rounding away test - it fails because of upward bias in multiplication */
+
+  TEST(QS8_Q31__SSE2, special_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .TestSpecialCases(xnn_qs8_requantize_q31__sse2);
+  }
+
+  TEST(QS8_Q31__SSE2, random_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .iterations(100)
+      .TestRandomCasesApproximate(xnn_qs8_requantize_q31__sse2);
+  }
+
+
+  /*
+   * Q31-based x86 SSSE3 implementation.
+   */
+
+  TEST(QS8_Q31__SSSE3, exact_divide_by_po2) {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestExactDivideByPO2(xnn_qs8_requantize_q31__ssse3);
+    }
+  }
+
+  TEST(QS8_Q31__SSSE3, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestExactDivideByPO2(xnn_qs8_requantize_q31__ssse3);
+      }
+    }
+  }
+
+  TEST(QS8_Q31__SSSE3, divide_by_po2_with_rounding_up) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_q31__ssse3);
+      }
+    }
+  }
+
+  /* No rounding down test - it fails because of upward bias in multiplication */
+  /* No rounding away test - it fails because of upward bias in multiplication */
+
+  TEST(QS8_Q31__SSSE3, special_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .TestSpecialCases(xnn_qs8_requantize_q31__ssse3);
+  }
+
+  TEST(QS8_Q31__SSSE3, random_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .iterations(100)
+      .TestRandomCasesApproximate(xnn_qs8_requantize_q31__ssse3);
+  }
+
+
+  /*
+   * Q31-based x86 SSE4 implementation.
+   */
+
+  TEST(QS8_Q31__SSE4, exact_divide_by_po2) {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestExactDivideByPO2(xnn_qs8_requantize_q31__sse4);
+    }
+  }
+
+  TEST(QS8_Q31__SSE4, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestExactDivideByPO2(xnn_qs8_requantize_q31__sse4);
+      }
+    }
+  }
+
+  TEST(QS8_Q31__SSE4, divide_by_po2_with_rounding_up) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_q31__sse4);
+      }
+    }
+  }
+
+  /* No rounding down test - it fails because of upward bias in multiplication */
+  /* No rounding away test - it fails because of upward bias in multiplication */
+
+  TEST(QS8_Q31__SSE4, special_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .TestSpecialCases(xnn_qs8_requantize_q31__sse4);
+  }
+
+  TEST(QS8_Q31__SSE4, random_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .iterations(100)
+      .TestRandomCasesApproximate(xnn_qs8_requantize_q31__sse4);
+  }
+#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+  /*
+   * Precise ARM NEON implementation.
+   */
+
+  TEST(QS8_PRECISE__NEON, exact_divide_by_po2) {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestExactDivideByPO2(xnn_qs8_requantize_precise__neon);
+    }
+  }
+
+  TEST(QS8_PRECISE__NEON, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestExactDivideByPO2(xnn_qs8_requantize_precise__neon);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__NEON, divide_by_po2_with_rounding_up) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__neon);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__NEON, divide_by_po2_with_rounding_down) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__neon);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__NEON, divide_by_po2_with_rounding_away) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__neon);
+      }
+    }
+  }
+
+  TEST(QS8_PRECISE__NEON, special_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .TestSpecialCases(xnn_qs8_requantize_precise__neon);
+  }
+
+  TEST(QS8_PRECISE__NEON, random_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .iterations(100)
+      .TestRandomCasesPrecise(xnn_qs8_requantize_precise__neon);
+  }
+
+
+  /*
+   * FP32-based ARM NEON implementation.
+   */
+
+  TEST(QS8_FP32__NEON, random_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .iterations(1000)
+      .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__neon);
+  }
+
+
+  /*
+   * Q31-based ARM NEON implementation.
+   */
+
+  TEST(QS8_Q31__NEON, exact_divide_by_po2) {
+    for (uint32_t s = 1; s < 32; s++) {
+      RequantizationTester()
+        .qmin(std::numeric_limits<int8_t>::min())
+        .qmax(std::numeric_limits<int8_t>::max())
+        .s(s)
+        .TestExactDivideByPO2(xnn_qs8_requantize_q31__neon);
+    }
+  }
+
+  TEST(QS8_Q31__NEON, exact_divide_by_po2_with_zero_point) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestExactDivideByPO2(xnn_qs8_requantize_q31__neon);
+      }
+    }
+  }
+
+  TEST(QS8_Q31__NEON, divide_by_po2_with_rounding_up) {
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      for (uint32_t s = 1; s < 32; s++) {
+        RequantizationTester()
+          .zero_point(zero_point)
+          .qmin(std::numeric_limits<int8_t>::min())
+          .qmax(std::numeric_limits<int8_t>::max())
+          .s(s)
+          .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_q31__neon);
+      }
+    }
+  }
+
+  /* No rounding down test - it fails because of upward bias in multiplication */
+  /* No rounding away test - it fails because of upward bias in multiplication */
+
+  TEST(QS8_Q31__NEON, special_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .TestSpecialCases(xnn_qs8_requantize_q31__neon);
+  }
+
+  TEST(QS8_Q31__NEON, random_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .iterations(100)
+      .TestRandomCasesApproximate(xnn_qs8_requantize_q31__neon);
+  }
+#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+#if XNN_ARCH_WASMSIMD
+  /*
+   * FP32-based WAsm SIMD implementation.
+   */
+
+  TEST(QS8_FP32__WASMSIMD, random_cases) {
+    RequantizationTester()
+      .qmin(std::numeric_limits<int8_t>::min())
+      .qmax(std::numeric_limits<int8_t>::max())
+      .iterations(1000)
+      .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__wasmsimd);
+  }
+#endif  // XNN_ARCH_WASMSIMD
diff --git a/test/qu8-requantization.cc b/test/qu8-requantization.cc
index 53cb190..bfcf313 100644
--- a/test/qu8-requantization.cc
+++ b/test/qu8-requantization.cc
@@ -25,6 +25,8 @@
 TEST(QU8_PRECISE__SCALAR_UNSIGNED32, exact_divide_by_po2) {
   for (uint32_t s = 1; s < 32; s++) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .s(s)
       .TestExactDivideByPO2(xnn_qu8_requantize_precise__scalar_unsigned32);
   }
@@ -35,6 +37,8 @@
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
         .zero_point(zero_point)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestExactDivideByPO2(xnn_qu8_requantize_precise__scalar_unsigned32);
     }
@@ -46,6 +50,8 @@
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
         .zero_point(zero_point)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__scalar_unsigned32);
     }
@@ -57,6 +63,8 @@
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
         .zero_point(zero_point)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__scalar_unsigned32);
     }
@@ -68,6 +76,8 @@
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
         .zero_point(zero_point)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__scalar_unsigned32);
     }
@@ -76,11 +86,16 @@
 
 TEST(QU8_PRECISE__SCALAR_UNSIGNED32, special_cases) {
   RequantizationTester()
+    .qmin(std::numeric_limits<uint8_t>::min())
+    .qmax(std::numeric_limits<uint8_t>::max())
     .TestSpecialCases(xnn_qu8_requantize_precise__scalar_unsigned32);
 }
 
 TEST(QU8_PRECISE__SCALAR_UNSIGNED32, random_cases) {
   RequantizationTester()
+    .qmin(std::numeric_limits<uint8_t>::min())
+    .qmax(std::numeric_limits<uint8_t>::max())
+    .zero_point(128)
     .iterations(100)
     .TestRandomCasesPrecise(xnn_qu8_requantize_precise__scalar_unsigned32);
 }
@@ -93,6 +108,8 @@
 TEST(QU8_PRECISE__SCALAR_UNSIGNED64, exact_divide_by_po2) {
   for (uint32_t s = 1; s < 32; s++) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .s(s)
       .TestExactDivideByPO2(xnn_qu8_requantize_precise__scalar_unsigned64);
   }
@@ -103,6 +120,8 @@
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
         .zero_point(zero_point)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestExactDivideByPO2(xnn_qu8_requantize_precise__scalar_unsigned64);
     }
@@ -114,6 +133,8 @@
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
         .zero_point(zero_point)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__scalar_unsigned64);
     }
@@ -125,6 +146,8 @@
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
         .zero_point(zero_point)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__scalar_unsigned64);
     }
@@ -136,6 +159,8 @@
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
         .zero_point(zero_point)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__scalar_unsigned64);
     }
@@ -144,11 +169,16 @@
 
 TEST(QU8_PRECISE__SCALAR_UNSIGNED64, special_cases) {
   RequantizationTester()
+    .qmin(std::numeric_limits<uint8_t>::min())
+    .qmax(std::numeric_limits<uint8_t>::max())
     .TestSpecialCases(xnn_qu8_requantize_precise__scalar_unsigned64);
 }
 
 TEST(QU8_PRECISE__SCALAR_UNSIGNED64, random_cases) {
   RequantizationTester()
+    .qmin(std::numeric_limits<uint8_t>::min())
+    .qmax(std::numeric_limits<uint8_t>::max())
+    .zero_point(128)
     .iterations(100)
     .TestRandomCasesPrecise(xnn_qu8_requantize_precise__scalar_unsigned64);
 }
@@ -161,6 +191,8 @@
 TEST(QU8_PRECISE__SCALAR_SIGNED64, exact_divide_by_po2) {
   for (uint32_t s = 1; s < 32; s++) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .s(s)
       .TestExactDivideByPO2(xnn_qu8_requantize_precise__scalar_signed64);
   }
@@ -171,6 +203,8 @@
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
         .zero_point(zero_point)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestExactDivideByPO2(xnn_qu8_requantize_precise__scalar_signed64);
     }
@@ -182,6 +216,8 @@
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
         .zero_point(zero_point)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__scalar_signed64);
     }
@@ -193,6 +229,8 @@
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
         .zero_point(zero_point)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__scalar_signed64);
     }
@@ -204,6 +242,8 @@
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
         .zero_point(zero_point)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__scalar_signed64);
     }
@@ -212,11 +252,16 @@
 
 TEST(QU8_PRECISE__SCALAR_SIGNED64, special_cases) {
   RequantizationTester()
+    .qmin(std::numeric_limits<uint8_t>::min())
+    .qmax(std::numeric_limits<uint8_t>::max())
     .TestSpecialCases(xnn_qu8_requantize_precise__scalar_signed64);
 }
 
 TEST(QU8_PRECISE__SCALAR_SIGNED64, random_cases) {
   RequantizationTester()
+    .qmin(std::numeric_limits<uint8_t>::min())
+    .qmax(std::numeric_limits<uint8_t>::max())
+    .zero_point(128)
     .iterations(100)
     .TestRandomCasesPrecise(xnn_qu8_requantize_precise__scalar_signed64);
 }
@@ -228,6 +273,8 @@
 
 TEST(QU8_FP32__SCALAR_LRINTF, random_cases) {
   RequantizationTester()
+    .qmin(std::numeric_limits<uint8_t>::min())
+    .qmax(std::numeric_limits<uint8_t>::max())
     .iterations(1000)
     .TestRandomCasesApproximate(xnn_qu8_requantize_fp32__scalar_lrintf);
 }
@@ -239,6 +286,8 @@
 
 TEST(QU8_FP32__SCALAR_MAGIC, random_cases) {
   RequantizationTester()
+    .qmin(std::numeric_limits<uint8_t>::min())
+    .qmax(std::numeric_limits<uint8_t>::max())
     .iterations(1000)
     .TestRandomCasesApproximate(xnn_qu8_requantize_fp32__scalar_magic);
 }
@@ -251,6 +300,8 @@
 TEST(QU8_Q31__SCALAR, exact_divide_by_po2) {
   for (uint32_t s = 1; s < 32; s++) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .s(s)
       .TestExactDivideByPO2(xnn_qu8_requantize_q31__scalar);
   }
@@ -261,6 +312,8 @@
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
         .zero_point(zero_point)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestExactDivideByPO2(xnn_qu8_requantize_q31__scalar);
     }
@@ -272,32 +325,28 @@
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
         .zero_point(zero_point)
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_q31__scalar);
     }
   }
 }
 
-/* No rounding down Test - it fails because of upward bias in multiplication */
-
-TEST(QU8_Q31__SCALAR, divide_by_po2_with_rounding_away) {
-  for (int32_t zero_point = 0; zero_point < 256; zero_point++) {
-    for (uint32_t s = 1; s < 32; s++) {
-      RequantizationTester()
-        .zero_point(zero_point)
-        .s(s)
-        .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_q31__scalar);
-    }
-  }
-}
+/* No rounding down test - it fails because of upward bias in multiplication */
+/* No rounding away test - it fails because of upward bias in multiplication */
 
 TEST(QU8_Q31__SCALAR, special_cases) {
   RequantizationTester()
+    .qmin(std::numeric_limits<uint8_t>::min())
+    .qmax(std::numeric_limits<uint8_t>::max())
     .TestSpecialCases(xnn_qu8_requantize_q31__scalar);
 }
 
 TEST(QU8_Q31__SCALAR, random_cases) {
   RequantizationTester()
+    .qmin(std::numeric_limits<uint8_t>::min())
+    .qmax(std::numeric_limits<uint8_t>::max())
     .iterations(100)
     .TestRandomCasesApproximate(xnn_qu8_requantize_q31__scalar);
 }
@@ -311,6 +360,8 @@
   TEST(QU8_PRECISE__PSIMD, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestExactDivideByPO2(xnn_qu8_requantize_precise__psimd);
     }
@@ -321,6 +372,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestExactDivideByPO2(xnn_qu8_requantize_precise__psimd);
       }
@@ -332,6 +385,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__psimd);
       }
@@ -343,6 +398,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__psimd);
       }
@@ -354,6 +411,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__psimd);
       }
@@ -362,11 +421,16 @@
 
   TEST(QU8_PRECISE__PSIMD, special_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .TestSpecialCases(xnn_qu8_requantize_precise__psimd);
   }
 
   TEST(QU8_PRECISE__PSIMD, random_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
+      .zero_point(128)
       .iterations(100)
       .TestRandomCasesPrecise(xnn_qu8_requantize_precise__psimd);
   }
@@ -378,6 +442,8 @@
 
   TEST(QU8_FP32__PSIMD, random_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .iterations(1000)
       .TestRandomCasesApproximate(xnn_qu8_requantize_fp32__psimd);
   }
@@ -392,6 +458,8 @@
   TEST(QU8_PRECISE__SSE2, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestExactDivideByPO2(xnn_qu8_requantize_precise__sse2);
     }
@@ -402,6 +470,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestExactDivideByPO2(xnn_qu8_requantize_precise__sse2);
       }
@@ -413,6 +483,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__sse2);
       }
@@ -424,6 +496,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__sse2);
       }
@@ -435,6 +509,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__sse2);
       }
@@ -443,11 +519,16 @@
 
   TEST(QU8_PRECISE__SSE2, special_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .TestSpecialCases(xnn_qu8_requantize_precise__sse2);
   }
 
   TEST(QU8_PRECISE__SSE2, random_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
+      .zero_point(128)
       .iterations(100)
       .TestRandomCasesPrecise(xnn_qu8_requantize_precise__sse2);
   }
@@ -460,6 +541,8 @@
   TEST(QU8_PRECISE__SSSE3, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestExactDivideByPO2(xnn_qu8_requantize_precise__ssse3);
     }
@@ -470,6 +553,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestExactDivideByPO2(xnn_qu8_requantize_precise__ssse3);
       }
@@ -481,6 +566,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__ssse3);
       }
@@ -492,6 +579,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__ssse3);
       }
@@ -503,6 +592,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__ssse3);
       }
@@ -511,11 +602,16 @@
 
   TEST(QU8_PRECISE__SSSE3, special_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .TestSpecialCases(xnn_qu8_requantize_precise__ssse3);
   }
 
   TEST(QU8_PRECISE__SSSE3, random_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
+      .zero_point(128)
       .iterations(100)
       .TestRandomCasesPrecise(xnn_qu8_requantize_precise__ssse3);
   }
@@ -528,6 +624,8 @@
   TEST(QU8_PRECISE__SSE4, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestExactDivideByPO2(xnn_qu8_requantize_precise__sse4);
     }
@@ -538,6 +636,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestExactDivideByPO2(xnn_qu8_requantize_precise__sse4);
       }
@@ -549,6 +649,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__sse4);
       }
@@ -560,6 +662,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__sse4);
       }
@@ -571,6 +675,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__sse4);
       }
@@ -579,11 +685,16 @@
 
   TEST(QU8_PRECISE__SSE4, special_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .TestSpecialCases(xnn_qu8_requantize_precise__sse4);
   }
 
   TEST(QU8_PRECISE__SSE4, random_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
+      .zero_point(128)
       .iterations(100)
       .TestRandomCasesPrecise(xnn_qu8_requantize_precise__sse4);
   }
@@ -595,6 +706,8 @@
 
   TEST(QU8_FP32__SSE2, random_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .iterations(1000)
       .TestRandomCasesApproximate(xnn_qu8_requantize_fp32__sse2);
   }
@@ -607,6 +720,8 @@
   TEST(QU8_Q31__SSE2, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestExactDivideByPO2(xnn_qu8_requantize_q31__sse2);
     }
@@ -617,6 +732,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestExactDivideByPO2(xnn_qu8_requantize_q31__sse2);
       }
@@ -628,32 +745,28 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_q31__sse2);
       }
     }
   }
 
-  /* No rounding down Test - it fails because of upward bias in multiplication */
-
-  TEST(QU8_Q31__SSE2, divide_by_po2_with_rounding_away) {
-    for (int32_t zero_point = 0; zero_point < 256; zero_point++) {
-      for (uint32_t s = 1; s < 32; s++) {
-        RequantizationTester()
-          .zero_point(zero_point)
-          .s(s)
-          .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_q31__sse2);
-      }
-    }
-  }
+  /* No rounding down test - it fails because of upward bias in multiplication */
+  /* No rounding away test - it fails because of upward bias in multiplication */
 
   TEST(QU8_Q31__SSE2, special_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .TestSpecialCases(xnn_qu8_requantize_q31__sse2);
   }
 
   TEST(QU8_Q31__SSE2, random_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .iterations(100)
       .TestRandomCasesApproximate(xnn_qu8_requantize_q31__sse2);
   }
@@ -666,6 +779,8 @@
   TEST(QU8_Q31__SSSE3, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestExactDivideByPO2(xnn_qu8_requantize_q31__ssse3);
     }
@@ -676,6 +791,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestExactDivideByPO2(xnn_qu8_requantize_q31__ssse3);
       }
@@ -687,32 +804,28 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_q31__ssse3);
       }
     }
   }
 
-  /* No rounding down Test - it fails because of upward bias in multiplication */
-
-  TEST(QU8_Q31__SSSE3, divide_by_po2_with_rounding_away) {
-    for (int32_t zero_point = 0; zero_point < 256; zero_point++) {
-      for (uint32_t s = 1; s < 32; s++) {
-        RequantizationTester()
-          .zero_point(zero_point)
-          .s(s)
-          .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_q31__ssse3);
-      }
-    }
-  }
+  /* No rounding down test - it fails because of upward bias in multiplication */
+  /* No rounding away test - it fails because of upward bias in multiplication */
 
   TEST(QU8_Q31__SSSE3, special_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .TestSpecialCases(xnn_qu8_requantize_q31__ssse3);
   }
 
   TEST(QU8_Q31__SSSE3, random_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .iterations(100)
       .TestRandomCasesApproximate(xnn_qu8_requantize_q31__ssse3);
   }
@@ -725,6 +838,8 @@
   TEST(QU8_Q31__SSE4, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestExactDivideByPO2(xnn_qu8_requantize_q31__sse4);
     }
@@ -735,6 +850,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestExactDivideByPO2(xnn_qu8_requantize_q31__sse4);
       }
@@ -746,32 +863,28 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_q31__sse4);
       }
     }
   }
 
-  /* No rounding down Test - it fails because of upward bias in multiplication */
-
-  TEST(QU8_Q31__SSE4, divide_by_po2_with_rounding_away) {
-    for (int32_t zero_point = 0; zero_point < 256; zero_point++) {
-      for (uint32_t s = 1; s < 32; s++) {
-        RequantizationTester()
-          .zero_point(zero_point)
-          .s(s)
-          .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_q31__sse4);
-      }
-    }
-  }
+  /* No rounding down test - it fails because of upward bias in multiplication */
+  /* No rounding away test - it fails because of upward bias in multiplication */
 
   TEST(QU8_Q31__SSE4, special_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .TestSpecialCases(xnn_qu8_requantize_q31__sse4);
   }
 
   TEST(QU8_Q31__SSE4, random_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .iterations(100)
       .TestRandomCasesApproximate(xnn_qu8_requantize_q31__sse4);
   }
@@ -785,6 +898,8 @@
   TEST(QU8_PRECISE__NEON, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestExactDivideByPO2(xnn_qu8_requantize_precise__neon);
     }
@@ -795,6 +910,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestExactDivideByPO2(xnn_qu8_requantize_precise__neon);
       }
@@ -806,6 +923,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__neon);
       }
@@ -817,6 +936,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__neon);
       }
@@ -828,6 +949,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__neon);
       }
@@ -836,11 +959,16 @@
 
   TEST(QU8_PRECISE__NEON, special_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .TestSpecialCases(xnn_qu8_requantize_precise__neon);
   }
 
   TEST(QU8_PRECISE__NEON, random_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
+      .zero_point(128)
       .iterations(100)
       .TestRandomCasesPrecise(xnn_qu8_requantize_precise__neon);
   }
@@ -852,6 +980,8 @@
 
   TEST(QU8_FP32__NEON, random_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .iterations(1000)
       .TestRandomCasesApproximate(xnn_qu8_requantize_fp32__neon);
   }
@@ -864,6 +994,8 @@
   TEST(QU8_Q31__NEON, exact_divide_by_po2) {
     for (uint32_t s = 1; s < 32; s++) {
       RequantizationTester()
+        .qmin(std::numeric_limits<uint8_t>::min())
+        .qmax(std::numeric_limits<uint8_t>::max())
         .s(s)
         .TestExactDivideByPO2(xnn_qu8_requantize_q31__neon);
     }
@@ -874,6 +1006,8 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestExactDivideByPO2(xnn_qu8_requantize_q31__neon);
       }
@@ -885,32 +1019,28 @@
       for (uint32_t s = 1; s < 32; s++) {
         RequantizationTester()
           .zero_point(zero_point)
+          .qmin(std::numeric_limits<uint8_t>::min())
+          .qmax(std::numeric_limits<uint8_t>::max())
           .s(s)
           .TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_q31__neon);
       }
     }
   }
 
-  /* No rounding down Test - it fails because of upward bias in multiplication */
-
-  TEST(QU8_Q31__NEON, divide_by_po2_with_rounding_away) {
-    for (int32_t zero_point = 0; zero_point < 256; zero_point++) {
-      for (uint32_t s = 1; s < 32; s++) {
-        RequantizationTester()
-          .zero_point(zero_point)
-          .s(s)
-          .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_q31__neon);
-      }
-    }
-  }
+  /* No rounding down test - it fails because of upward bias in multiplication */
+  /* No rounding away test - it fails because of upward bias in multiplication */
 
   TEST(QU8_Q31__NEON, special_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .TestSpecialCases(xnn_qu8_requantize_q31__neon);
   }
 
   TEST(QU8_Q31__NEON, random_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .iterations(100)
       .TestRandomCasesApproximate(xnn_qu8_requantize_q31__neon);
   }
@@ -923,6 +1053,8 @@
 
   TEST(QU8_FP32__WASMSIMD, random_cases) {
     RequantizationTester()
+      .qmin(std::numeric_limits<uint8_t>::min())
+      .qmax(std::numeric_limits<uint8_t>::max())
       .iterations(1000)
       .TestRandomCasesApproximate(xnn_qu8_requantize_fp32__wasmsimd);
   }
diff --git a/test/requantization-tester.h b/test/requantization-tester.h
index 8e34ac3..766dad7 100644
--- a/test/requantization-tester.h
+++ b/test/requantization-tester.h
@@ -22,7 +22,7 @@
 
 #include <xnnpack/params.h>
 #include <xnnpack/requantization-stubs.h>
-#include <xnnpack/scalar-utils.h>
+#include <xnnpack/requantization.h>
 
 
 class RequantizationTester {
@@ -49,21 +49,21 @@
     return this->zero_point_;
   }
 
-  inline RequantizationTester& qmin(uint8_t qmin) {
+  inline RequantizationTester& qmin(int16_t qmin) {
     this->qmin_ = qmin;
     return *this;
   }
 
-  inline uint8_t qmin() const {
+  inline int16_t qmin() const {
     return this->qmin_;
   }
 
-  inline RequantizationTester& qmax(uint8_t qmax) {
+  inline RequantizationTester& qmax(int16_t qmax) {
     this->qmax_ = qmax;
     return *this;
   }
 
-  inline uint8_t qmax() const {
+  inline int16_t qmax() const {
     return this->qmax_;
   }
 
@@ -84,8 +84,13 @@
    * produces exactly i, provided that ((i - zero point) * 2**s) does not overflow.
    */
   void TestExactDivideByPO2(xnn_qu8_requantization_function requantize) const {
-    ASSERT_GE(zero_point(), 0);
-    ASSERT_LE(zero_point(), 255);
+    ASSERT_GE(zero_point(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(zero_point(), std::numeric_limits<uint8_t>::max());
+    ASSERT_GE(qmin(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(qmin(), std::numeric_limits<uint8_t>::max());
+    ASSERT_GE(qmax(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(qmax(), std::numeric_limits<uint8_t>::max());
+    ASSERT_LT(qmin(), qmax());
 
     /* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
     ASSERT_GE(s(), 1);
@@ -93,20 +98,61 @@
 
     std::vector<int32_t> inputs(256);
     std::vector<uint8_t> outputs(inputs.size());
-    const int32_t maxI = (uint32_t(std::numeric_limits<int32_t>::max()) >> s()) + zero_point();
-    const int32_t minI = -(-uint32_t(std::numeric_limits<int32_t>::min()) >> s()) + zero_point();
-    for (int32_t i = 0; i < 256; i++) {
-      const int32_t clampedI = std::max(minI, std::min(maxI, i));
-      inputs[i] = int32_t(uint32_t(clampedI - zero_point()) << s());
+    const int32_t max_i = (uint32_t(std::numeric_limits<int32_t>::max()) >> s()) + zero_point();
+    const int32_t min_i = -(-uint32_t(std::numeric_limits<int32_t>::min()) >> s()) + zero_point();
+    for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
+      const int32_t clamped_i = std::max(min_i, std::min(max_i, i));
+      inputs[i] = int32_t(uint32_t(clamped_i - zero_point()) << s());
     }
     requantize(inputs.size(), inputs.data(),
         scale(), zero_point(), qmin(), qmax(),
         outputs.data());
-    for (int32_t i = 0; i < 256; i++) {
-      const int32_t clampedI = std::max(minI, std::min(maxI, i));
-      ASSERT_EQ(clampedI, outputs[i]) << "i = " << i << ", clamped i = " << clampedI <<
-        ", min i = " << minI << ", max i = " << maxI <<
-        ", s = " << s() << ", zero point = " << zero_point();
+    for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
+      const int32_t clamped_i = std::max(min_i, std::min(max_i, i));
+      ASSERT_EQ(uint32_t(clamped_i), uint32_t(outputs[i]))
+        << "i = " << i << ", clamped i = " << clamped_i
+        << ", min i = " << min_i << ", max i = " << max_i
+        << ", s = " << s() << ", zero point = " << zero_point();
+    }
+  }
+
+  /*
+   * Test that requantization of numbers ((i - zero point) * 2**s) with
+   * - scale = exp2(-s)
+   * - zero point in [-128, 127]
+   * - no output clamping
+   * produces exactly i, provided that ((i - zero point) * 2**s) does not overflow.
+   */
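+  /*
+   * Worked instance (illustrative): with s = 4 and zero point = 10, i = -3
+   * yields input = (-3 - 10) << 4 = -208; requantizing with scale = 2**-4
+   * gives -208 * 0.0625 + 10 = -13 + 10 = -3 = i.
+   */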
+  void TestExactDivideByPO2(xnn_qs8_requantization_function requantize) const {
+    ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
+    ASSERT_GE(qmin(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(qmin(), std::numeric_limits<int8_t>::max());
+    ASSERT_GE(qmax(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(qmax(), std::numeric_limits<int8_t>::max());
+    ASSERT_LT(qmin(), qmax());
+
+    /* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
+    ASSERT_GE(s(), 1);
+    ASSERT_LT(s(), 32);
+
+    std::vector<int32_t> inputs(256);
+    std::vector<int8_t> outputs(inputs.size());
+    const int32_t max_i = (uint32_t(std::numeric_limits<int32_t>::max()) >> s()) + zero_point();
+    const int32_t min_i = -(-uint32_t(std::numeric_limits<int32_t>::min()) >> s()) + zero_point();
+    for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+      const int32_t clamped_i = std::max(min_i, std::min(max_i, i));
+      inputs[i - std::numeric_limits<int8_t>::min()] = int32_t(uint32_t(clamped_i - zero_point()) << s());
+    }
+    requantize(inputs.size(), inputs.data(),
+        scale(), zero_point(), qmin(), qmax(),
+        outputs.data());
+    for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+      const int32_t clamped_i = std::max(min_i, std::min(max_i, i));
+      ASSERT_EQ(clamped_i, int32_t(outputs[i - std::numeric_limits<int8_t>::min()]))
+        << "i = " << i << ", clamped i = " << clamped_i
+        << ", min i = " << min_i << ", max i = " << max_i
+        << ", s = " << s() << ", zero point = " << zero_point();
     }
   }
 
@@ -118,8 +164,13 @@
    * produces exactly i, provided that ((i - zero point) * 2**s) does not overflow.
    */
   void TestDivideByPO2WithRoundingUp(xnn_qu8_requantization_function requantize) {
-    ASSERT_GE(zero_point(), 0);
-    ASSERT_LE(zero_point(), 255);
+    ASSERT_GE(zero_point(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(zero_point(), std::numeric_limits<uint8_t>::max());
+    ASSERT_GE(qmin(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(qmin(), std::numeric_limits<uint8_t>::max());
+    ASSERT_GE(qmax(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(qmax(), std::numeric_limits<uint8_t>::max());
+    ASSERT_LT(qmin(), qmax());
 
     /* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
     ASSERT_GE(s(), 1);
@@ -127,7 +178,7 @@
 
     std::vector<int32_t> inputs(256);
     std::vector<uint8_t> outputs(inputs.size());
-    for (int32_t i = 0; i < 256; i++) {
+    for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
       const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) -
         (INT64_C(1) << (s() - 1)) + (int64_t) (i <= zero_point());
       inputs[i] = int32_t(input);
@@ -135,12 +186,54 @@
     requantize(inputs.size(), inputs.data(),
         scale(), zero_point(), qmin(), qmax(),
         outputs.data());
-    for (int32_t i = 0; i < 256; i++) {
+    for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
       const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) -
         (INT64_C(1) << (s() - 1)) + (int64_t) (i <= zero_point());
       if (int32_t(input) == input) {
-        ASSERT_EQ(i, uint32_t(outputs[i])) << "i = " << i << ", input = " << input <<
-          ", s = " << s() << ", zero point = " << zero_point();
+        ASSERT_EQ(i, int32_t(outputs[i]))
+          << "i = " << i << ", input = " << input
+          << ", s = " << s() << ", zero point = " << zero_point();
+      }
+    }
+  }
+
+  /*
+   * Test that requantization of numbers ((i - zero point) * 2**s - 2**(s-1) + (i <= zero point ? 1 : 0)) with
+   * - scale = exp2(-s)
+   * - zero point in [-128, 127]
+   * - no output clamping
+   * produces exactly i, provided that ((i - zero point) * 2**s) does not overflow.
+   */
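+  /*
+   * Worked instance (illustrative): with s = 3, zero point = 0 and i = 5,
+   * input = 5 * 8 - 4 + 0 = 36; 36 * 2**-3 = 4.5, and only an implementation
+   * that rounds this tie upward recovers i = 5.
+   */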
+  void TestDivideByPO2WithRoundingUp(xnn_qs8_requantization_function requantize) {
+    ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
+    ASSERT_GE(qmin(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(qmin(), std::numeric_limits<int8_t>::max());
+    ASSERT_GE(qmax(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(qmax(), std::numeric_limits<int8_t>::max());
+    ASSERT_LT(qmin(), qmax());
+
+    /* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
+    ASSERT_GE(s(), 1);
+    ASSERT_LT(s(), 32);
+
+    std::vector<int32_t> inputs(256);
+    std::vector<int8_t> outputs(inputs.size());
+    for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+      const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) -
+        (INT64_C(1) << (s() - 1)) + (int64_t) (i <= zero_point());
+      inputs[i - std::numeric_limits<int8_t>::min()] = int32_t(input);
+    }
+    requantize(inputs.size(), inputs.data(),
+        scale(), zero_point(), qmin(), qmax(),
+        outputs.data());
+    for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+      const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) -
+        (INT64_C(1) << (s() - 1)) + (int64_t) (i <= zero_point());
+      if (int32_t(input) == input) {
+        ASSERT_EQ(i, int32_t(outputs[i - std::numeric_limits<int8_t>::min()]))
+          << "i = " << i << ", input = " << input
+          << ", s = " << s() << ", zero point = " << zero_point();
       }
     }
   }
@@ -153,8 +246,13 @@
    * produces exactly i, provided that ((i - zero point) * 2**s) does not overflow.
    */
   void TestDivideByPO2WithRoundingDown(xnn_qu8_requantization_function requantize) {
-    ASSERT_GE(zero_point(), 0);
-    ASSERT_LE(zero_point(), 255);
+    ASSERT_GE(zero_point(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(zero_point(), std::numeric_limits<uint8_t>::max());
+    ASSERT_GE(qmin(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(qmin(), std::numeric_limits<uint8_t>::max());
+    ASSERT_GE(qmax(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(qmax(), std::numeric_limits<uint8_t>::max());
+    ASSERT_LT(qmin(), qmax());
 
     /* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
     ASSERT_GE(s(), 1);
@@ -162,7 +260,7 @@
 
     std::vector<int32_t> inputs(256);
     std::vector<uint8_t> outputs(inputs.size());
-    for (int32_t i = 0; i < 256; i++) {
+    for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
       const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) +
         (INT64_C(1) << (s() - 1)) - (int64_t) (i >= zero_point());
       inputs[i] = int32_t(input);
@@ -170,19 +268,66 @@
     requantize(inputs.size(), inputs.data(),
         scale(), zero_point(), qmin(), qmax(),
         outputs.data());
-    for (int32_t i = 0; i < 256; i++) {
+    for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
       const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) +
         (INT64_C(1) << (s() - 1)) - (int64_t) (i >= zero_point());
       if (int32_t(input) == input) {
-        ASSERT_EQ(i, uint32_t(outputs[i])) << "i = " << i << ", input = " << input <<
-          ", s = " << s() << ", zero point = " << zero_point();
+        ASSERT_EQ(i, int32_t(outputs[i]))
+          << "i = " << i << ", input = " << input
+          << ", s = " << s() << ", zero point = " << zero_point();
+      }
+    }
+  }
+
+  /*
+   * Test that requantization of numbers ((i - zero point) * 2**s + 2**(s-1) - (i >= zero point ? 1 : 0)) with
+   * - scale = exp2(-s)
+   * - zero point in [-128, 127]
+   * - no output clamping
+   * produces exactly i, provided that ((i - zero point) * 2**s) does not overflow.
+   */
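+  /*
+   * Worked instance (illustrative): with s = 3, zero point = 0 and i = 5,
+   * input = 5 * 8 + 4 - 1 = 43; 43 * 2**-3 = 5.375, which rounds to nearest
+   * as i = 5. For larger s the value approaches i + 0.5 from below, the
+   * regime where upward-biased multiplication fails this test.
+   */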
+  void TestDivideByPO2WithRoundingDown(xnn_qs8_requantization_function requantize) {
+    ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
+    ASSERT_GE(qmin(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(qmin(), std::numeric_limits<int8_t>::max());
+    ASSERT_GE(qmax(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(qmax(), std::numeric_limits<int8_t>::max());
+    ASSERT_LT(qmin(), qmax());
+
+    /* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
+    ASSERT_GE(s(), 1);
+    ASSERT_LT(s(), 32);
+
+    std::vector<int32_t> inputs(256);
+    std::vector<int8_t> outputs(inputs.size());
+    for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+      const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) +
+        (INT64_C(1) << (s() - 1)) - (int64_t) (i >= zero_point());
+      inputs[i - std::numeric_limits<int8_t>::min()] = int32_t(input);
+    }
+    requantize(inputs.size(), inputs.data(),
+        scale(), zero_point(), qmin(), qmax(),
+        outputs.data());
+    for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+      const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) +
+        (INT64_C(1) << (s() - 1)) - (int64_t) (i >= zero_point());
+      if (int32_t(input) == input) {
+        ASSERT_EQ(i, int32_t(outputs[i - std::numeric_limits<int8_t>::min()]))
+          << "i = " << i << ", input = " << input
+          << ", s = " << s() << ", zero point = " << zero_point();
       }
     }
   }
 
   void TestDivideByPO2WithRoundingAway(xnn_qu8_requantization_function requantize) {
-    ASSERT_GE(zero_point(), 0);
-    ASSERT_LE(zero_point(), 255);
+    ASSERT_GE(zero_point(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(zero_point(), std::numeric_limits<uint8_t>::max());
+    ASSERT_GE(qmin(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(qmin(), std::numeric_limits<uint8_t>::max());
+    ASSERT_GE(qmax(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(qmax(), std::numeric_limits<uint8_t>::max());
+    ASSERT_LT(qmin(), qmax());
 
     /* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
     ASSERT_GE(s(), 1);
@@ -190,7 +335,7 @@
 
     std::vector<int32_t> inputs(256);
     std::vector<uint8_t> outputs(inputs.size());
-    for (int32_t i = 0; i < 256; i++) {
+    for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
       int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s());
       if (input > 0) {
         input -= INT64_C(1) << (s() - 1);
@@ -202,7 +347,7 @@
     requantize(inputs.size(), inputs.data(),
         scale(), zero_point(), qmin(), qmax(),
         outputs.data());
-    for (uint32_t i = 0; i < 256; i++) {
+    for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
       int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s());
       if (input > 0) {
         input -= INT64_C(1) << (s() - 1);
@@ -210,18 +355,67 @@
         input += INT64_C(1) << (s() - 1);
       }
       if (int32_t(input) == input) {
-        ASSERT_EQ(i, uint32_t(outputs[i])) << "i = " << i << ", input = " << input <<
-          ", s = " << s() << ", zero point = " << zero_point();
+        ASSERT_EQ(i, int32_t(outputs[i]))
+          << "i = " << i << ", input = " << input
+          << ", s = " << s() << ", zero point = " << zero_point();
+      }
+    }
+  }
+
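+  /*
+   * Test that requantization of numbers ((i - zero point) * 2**s) offset by
+   * 2**(s-1) toward zero (so that rounding half away from zero recovers i)
+   * with
+   * - scale = exp2(-s)
+   * - zero point in [-128, 127]
+   * - no output clamping
+   * produces exactly i, provided that ((i - zero point) * 2**s) does not overflow.
+   */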
+  void TestDivideByPO2WithRoundingAway(xnn_qs8_requantization_function requantize) {
+    ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
+    ASSERT_GE(qmin(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(qmin(), std::numeric_limits<int8_t>::max());
+    ASSERT_GE(qmax(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(qmax(), std::numeric_limits<int8_t>::max());
+    ASSERT_LT(qmin(), qmax());
+
+    /* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
+    ASSERT_GE(s(), 1);
+    ASSERT_LT(s(), 32);
+
+    std::vector<int32_t> inputs(256);
+    std::vector<int8_t> outputs(inputs.size());
+    for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+      int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s());
+      if (input > 0) {
+        input -= INT64_C(1) << (s() - 1);
+      } else if (input < 0) {
+        input += INT64_C(1) << (s() - 1);
+      }
+      inputs[i - std::numeric_limits<int8_t>::min()] = int32_t(input);
+    }
+    requantize(inputs.size(), inputs.data(),
+        scale(), zero_point(), qmin(), qmax(),
+        outputs.data());
+    for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+      int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s());
+      if (input > 0) {
+        input -= INT64_C(1) << (s() - 1);
+      } else if (input < 0) {
+        input += INT64_C(1) << (s() - 1);
+      }
+      if (int32_t(input) == input) {
+        ASSERT_EQ(i, int32_t(outputs[i - std::numeric_limits<int8_t>::min()]))
+          << "i = " << i << ", input = " << input
+          << ", s = " << s() << ", zero point = " << zero_point();
       }
     }
   }
 
   void TestSpecialCases(xnn_qu8_requantization_function requantize) {
+    ASSERT_GE(qmin(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(qmin(), std::numeric_limits<uint8_t>::max());
+    ASSERT_GE(qmax(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(qmax(), std::numeric_limits<uint8_t>::max());
+    ASSERT_LT(qmin(), qmax());
+
     std::vector<int32_t> inputs(256);
     std::vector<uint8_t> outputs(inputs.size());
 
     std::fill(inputs.begin(), inputs.end(), std::numeric_limits<int32_t>::min());
-    for (int32_t zero_point = 0; zero_point < 256; zero_point++) {
+    for (int32_t zero_point = 0; zero_point <= std::numeric_limits<uint8_t>::max(); zero_point++) {
       requantize(
           inputs.size(),
           inputs.data(),
@@ -230,7 +424,9 @@
           std::numeric_limits<uint8_t>::min(),
           std::numeric_limits<uint8_t>::max(),
           outputs.data());
-      ASSERT_EQ(std::max(int32_t(0), zero_point - 1), *std::min_element(outputs.cbegin(), outputs.cend()));
+      for (size_t i = 0; i < outputs.size(); i++) {
+        ASSERT_EQ(std::max(int32_t(std::numeric_limits<uint8_t>::min()), zero_point - 1), int32_t(outputs[i]));
+      }
     }
 
     std::fill(inputs.begin(), inputs.end(), std::numeric_limits<int32_t>::max());
@@ -242,73 +438,167 @@
         std::numeric_limits<uint8_t>::min(),
         std::numeric_limits<uint8_t>::max(),
         outputs.data());
-    for (size_t i = 0; i < inputs.size(); i++) {
-      ASSERT_EQ(std::numeric_limits<uint8_t>::max(), outputs[i]);
+    for (size_t i = 0; i < outputs.size(); i++) {
+      ASSERT_EQ(std::numeric_limits<uint8_t>::max(), int32_t(outputs[i]));
+    }
+  }
+
+  void TestSpecialCases(xnn_qs8_requantization_function requantize) {
+    ASSERT_GE(qmin(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(qmin(), std::numeric_limits<int8_t>::max());
+    ASSERT_GE(qmax(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(qmax(), std::numeric_limits<int8_t>::max());
+    ASSERT_LT(qmin(), qmax());
+
+    std::vector<int32_t> inputs(256);
+    std::vector<int8_t> outputs(inputs.size());
+
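+    /* All inputs are INT32_MIN with scale = 2**-32, so each scaled value is
+     * exactly -0.5; rounding half away from zero (as the precise and Q31
+     * schemes do) gives -1, i.e. every output is zero_point - 1, clamped at
+     * the type minimum. */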
+    std::fill(inputs.begin(), inputs.end(), std::numeric_limits<int32_t>::min());
+    for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+         zero_point <= std::numeric_limits<int8_t>::max();
+         zero_point++)
+    {
+      requantize(
+          inputs.size(),
+          inputs.data(),
+          ldexpf(1.0f, -32) /* scale */,
+          zero_point,
+          std::numeric_limits<int8_t>::min(),
+          std::numeric_limits<int8_t>::max(),
+          outputs.data());
+      for (size_t i = 0; i < outputs.size(); i++) {
+        ASSERT_EQ(std::max(int32_t(std::numeric_limits<int8_t>::min()), zero_point - 1), int32_t(outputs[i]));
+      }
+    }
+
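+    /* All inputs are INT32_MAX with scale = 0x1.FFFFFEp-1f, the largest
+     * float below 1.0, so after adding the maximal zero point every scaled
+     * value exceeds the int8 range and must saturate at qmax. */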
+    std::fill(inputs.begin(), inputs.end(), std::numeric_limits<int32_t>::max());
+    requantize(
+        inputs.size(),
+        inputs.data(),
+        0x1.FFFFFEp-1f /* scale */,
+        std::numeric_limits<int8_t>::max() /* zero point */,
+        std::numeric_limits<int8_t>::min(),
+        std::numeric_limits<int8_t>::max(),
+        outputs.data());
+    for (size_t i = 0; i < outputs.size(); i++) {
+      ASSERT_EQ(std::numeric_limits<int8_t>::max(), int32_t(outputs[i]));
     }
   }
 
   void TestRandomCasesPrecise(xnn_qu8_requantization_function requantize) {
+    ASSERT_GE(zero_point(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(zero_point(), std::numeric_limits<uint8_t>::max());
+    ASSERT_GE(qmin(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(qmin(), std::numeric_limits<uint8_t>::max());
+    ASSERT_GE(qmax(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(qmax(), std::numeric_limits<uint8_t>::max());
+    ASSERT_LT(qmin(), qmax());
+
     std::random_device random_device;
     std::mt19937 rng(random_device());
     for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
+      auto u8rng =
+        std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
 
       std::vector<int32_t> inputs(4096);
       std::vector<uint8_t> outputs(inputs.size());
 
-      const uint8_t zero_point = UINT8_C(128);
       std::uniform_real_distribution<float> scale_distribution(0x1.000000p-23f, 0x1.FFFFFEp-1f);
       const float scale = scale_distribution(rng);
       for (size_t i = 0; i < inputs.size(); i++) {
-        const uint8_t approximate_output = u8rng();
+        const uint8_t approximate_output = std::min(std::max(uint8_t(u8rng()), uint8_t(qmin())), uint8_t(qmax()));
         const int32_t input = int32_t(double(approximate_output) / double(scale));
         inputs[i] = input;
       }
 
       requantize(
-        inputs.size(), inputs.data(), scale, zero_point,
-        std::numeric_limits<uint8_t>::min(),
-        std::numeric_limits<uint8_t>::max(),
+        inputs.size(), inputs.data(), scale, zero_point(), qmin(), qmax(),
         outputs.data());
 
-      /* Ensure that outputs are not all identical, as in this case Test doesn't validate much */
+      /* Ensure that outputs are not all identical, as in this case the test doesn't validate much */
       ASSERT_NE(
         *std::max_element(outputs.cbegin(), outputs.cend()),
         *std::min_element(outputs.cbegin(), outputs.cend()));
 
       for (size_t i = 0; i < inputs.size(); i++) {
         const uint8_t reference_output =
-          scalar_requantize_precise(
-            inputs[i], scale, zero_point,
-            std::numeric_limits<uint8_t>::min(),
-            std::numeric_limits<uint8_t>::max());
+          xnn_qu8_requantize_precise(inputs[i], scale, zero_point(), qmin(), qmax());
         ASSERT_EQ(uint32_t(reference_output), uint32_t(outputs[i]));
       }
     }
   }
 
-  void TestRandomCasesApproximate(xnn_qu8_requantization_function requantize) {
+  void TestRandomCasesPrecise(xnn_qs8_requantization_function requantize) {
+    ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
+    ASSERT_GE(qmin(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(qmin(), std::numeric_limits<int8_t>::max());
+    ASSERT_GE(qmax(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(qmax(), std::numeric_limits<int8_t>::max());
+    ASSERT_LT(qmin(), qmax());
+
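+    /* Strategy: pick random representable outputs, invert the scaling to
+     * construct matching inputs, and check the kernel elementwise against
+     * the scalar reference xnn_qs8_requantize_precise. */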
     std::random_device random_device;
     std::mt19937 rng(random_device());
     for (size_t iteration = 0; iteration < iterations(); iteration++) {
-      auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
+      auto s8rng = std::bind(
+        std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), std::ref(rng));
 
       std::vector<int32_t> inputs(4096);
-      std::vector<uint8_t> outputs(inputs.size());
+      std::vector<int8_t> outputs(inputs.size());
 
-      const uint8_t zero_point = UINT8_C(128);
       std::uniform_real_distribution<float> scale_distribution(0x1.000000p-23f, 0x1.FFFFFEp-1f);
       const float scale = scale_distribution(rng);
       for (size_t i = 0; i < inputs.size(); i++) {
-        const uint8_t approximate_output = u8rng();
+        const int8_t approximate_output = std::min(std::max(int8_t(s8rng()), int8_t(qmin())), int8_t(qmax()));
         const int32_t input = int32_t(double(approximate_output) / double(scale));
         inputs[i] = input;
       }
 
       requantize(
-        inputs.size(), inputs.data(), scale, zero_point,
-        std::numeric_limits<uint8_t>::min(),
-        std::numeric_limits<uint8_t>::max(),
+        inputs.size(), inputs.data(), scale, zero_point(), qmin(), qmax(),
+        outputs.data());
+
+      /* Ensure that outputs are not all identical, as in this case the test doesn't validate much */
+      ASSERT_NE(
+        *std::max_element(outputs.cbegin(), outputs.cend()),
+        *std::min_element(outputs.cbegin(), outputs.cend()));
+
+      for (size_t i = 0; i < inputs.size(); i++) {
+        const int8_t reference_output =
+          xnn_qs8_requantize_precise(inputs[i], scale, zero_point(), qmin(), qmax());
+        ASSERT_EQ(int32_t(reference_output), int32_t(outputs[i]));
+      }
+    }
+  }
+
+  void TestRandomCasesApproximate(xnn_qu8_requantization_function requantize) {
+    ASSERT_GE(zero_point(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(zero_point(), std::numeric_limits<uint8_t>::max());
+    ASSERT_GE(qmin(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(qmin(), std::numeric_limits<uint8_t>::max());
+    ASSERT_GE(qmax(), std::numeric_limits<uint8_t>::min());
+    ASSERT_LE(qmax(), std::numeric_limits<uint8_t>::max());
+    ASSERT_LT(qmin(), qmax());
+
+    std::random_device random_device;
+    std::mt19937 rng(random_device());
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      auto u8rng =
+        std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
+
+      std::vector<int32_t> inputs(4096);
+      std::vector<uint8_t> outputs(inputs.size());
+
+      std::uniform_real_distribution<float> scale_distribution(0x1.000000p-23f, 0x1.FFFFFEp-1f);
+      const float scale = scale_distribution(rng);
+      for (size_t i = 0; i < inputs.size(); i++) {
+        const uint8_t approximate_output = std::min(std::max(uint8_t(u8rng()), uint8_t(qmin())), uint8_t(qmax()));
+        const int32_t input = int32_t(double(approximate_output) / double(scale));
+        inputs[i] = input;
+      }
+
+      requantize(
+        inputs.size(), inputs.data(), scale, zero_point(), qmin(), qmax(),
         outputs.data());
 
-      /* Ensure that outputs are not all identical, as in this case Test doesn't validate much */
+      /* Ensure that outputs are not all identical, as in this case the test doesn't validate much */
@@ -317,14 +607,56 @@
         *std::min_element(outputs.cbegin(), outputs.cend()));
 
       for (size_t i = 0; i < inputs.size(); i++) {
-        const double reference_output =
-          RequantizationTester::RequantizeApproximate(
-            inputs[i], scale, zero_point,
-            std::numeric_limits<uint8_t>::min(),
-            std::numeric_limits<uint8_t>::max());
-        ASSERT_LE(fabs(reference_output - double(outputs[i])), 0.55) <<
-          "input = " << inputs[i] <<
-          ", output = " << uint32_t(outputs[i]) << ", reference output = " << reference_output;
+        const double reference_output = RequantizationTester::RequantizeApproximate(
+          inputs[i], scale, uint8_t(zero_point()), uint8_t(qmin()), uint8_t(qmax()));
+        ASSERT_LE(std::abs(reference_output - double(outputs[i])), 0.55)
+          << "input = " << inputs[i] << ", output = " << int32_t(outputs[i])
+          << ", reference output = " << reference_output;
+      }
+    }
+  }
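The 0.55 tolerance above covers the unavoidable 0.5 gap between the real-valued reference and any integer output, plus a small margin for the approximate kernels' quantized multiplier. A worked instance with illustrative numbers (not drawn from the test):

    input = 803, scale = 0.125, zero_point = 128
    reference = 803 * 0.125 + 128 = 228.375    (real-valued, within [qmin, qmax])
    |output - 228.375| <= 0.55  =>  only output = 228 passes
    a tie such as reference = 228.5 would admit either 228 or 229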
+
+  void TestRandomCasesApproximate(xnn_qs8_requantization_function requantize) {
+    ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
+    ASSERT_GE(qmin(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(qmin(), std::numeric_limits<int8_t>::max());
+    ASSERT_GE(qmax(), std::numeric_limits<int8_t>::min());
+    ASSERT_LE(qmax(), std::numeric_limits<int8_t>::max());
+    ASSERT_LT(qmin(), qmax());
+
+    std::random_device random_device;
+    std::mt19937 rng(random_device());
+    for (size_t iteration = 0; iteration < iterations(); iteration++) {
+      auto s8rng = std::bind(
+        std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), std::ref(rng));
+
+      std::vector<int32_t> inputs(4096);
+      std::vector<int8_t> outputs(inputs.size());
+
+      std::uniform_real_distribution<float> scale_distribution(0x1.000000p-23f, 0x1.FFFFFEp-1f);
+      const float scale = scale_distribution(rng);
+      for (size_t i = 0; i < inputs.size(); i++) {
+        const int8_t approximate_output = std::min(std::max(int8_t(s8rng()), int8_t(qmin())), int8_t(qmax()));
+        const int32_t input = int32_t(double(approximate_output) / double(scale));
+        inputs[i] = input;
+      }
+
+      requantize(
+        inputs.size(), inputs.data(), scale, zero_point(), qmin(), qmax(),
+        outputs.data());
+
+      /* Ensure that outputs are not all identical, as in this case the test doesn't validate much */
+      ASSERT_NE(
+        *std::max_element(outputs.cbegin(), outputs.cend()),
+        *std::min_element(outputs.cbegin(), outputs.cend()));
+
+      for (size_t i = 0; i < inputs.size(); i++) {
+        const double reference_output = RequantizationTester::RequantizeApproximate(
+          inputs[i], scale, int8_t(zero_point()), int8_t(qmin()), int8_t(qmax()));
+        ASSERT_LE(std::abs(reference_output - double(outputs[i])), 0.55)
+          << "input = " << inputs[i] << ", output = " << int32_t(outputs[i])
+          << ", reference output = " << reference_output;
       }
     }
   }
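The input-construction loop in both approximate tests runs the mapping backwards: it picks a clamped target output at random and derives an input whose product with the scale lands on that target, so the outputs are spread across the quantized range as the ASSERT_NE check demands. A round-trip sketch with made-up values:

    #include <cstdint>

    void round_trip_sketch() {
      const float scale = 0x1.0p-4f;  // 0.0625, exactly representable
      const int8_t target = 37;
      const int32_t input = int32_t(double(target) / double(scale));  // 592
      // 592 * 0.0625 == 37.0 exactly, so requantization recovers the target
      // (shifted by the zero point) before clamping to [qmin, qmax].
      (void) input;
    }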
@@ -343,25 +675,26 @@
     assert(scale < 1.0f);
     assert(scale >= 0x1.0p-32f);
 
-    double clamped_value = double(value) * double(scale) + double(zero_point);
+    return std::min(std::max(double(value) * double(scale) + double(zero_point), double(qmin)), double(qmax));
+  }
 
-    const double fmin = double(qmin);
-    if (clamped_value < fmin) {
-      clamped_value = fmin;
-    }
+  static inline double RequantizeApproximate(
+    int32_t value,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax)
+  {
+    assert(scale < 1.0f);
+    assert(scale >= 0x1.0p-32f);
 
-    const double fmax = double(qmax);
-    if (clamped_value > fmax) {
-      clamped_value = fmax;
-    }
-
-    return clamped_value;
+    return std::min(std::max(double(value) * double(scale) + double(zero_point), double(qmin)), double(qmax));
   }
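The refactor above folds the branch ladder into a nested min/max; for qmin <= qmax (and NaN aside) the two forms are interchangeable, which a throwaway check makes explicit:

    #include <algorithm>
    #include <cassert>

    void clamp_forms_agree(double x, double lo, double hi) {
      assert(lo <= hi);
      const double branchy = x < lo ? lo : (x > hi ? hi : x);
      assert(std::min(std::max(x, lo), hi) == branchy);
    }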
 
  private:
-  size_t zero_point_{0};
-  size_t s_{1};
-  uint8_t qmin_{std::numeric_limits<uint8_t>::min()};
-  uint8_t qmax_{std::numeric_limits<uint8_t>::max()};
+  uint32_t s_{1};
+  int32_t zero_point_{0};
+  int16_t qmin_{std::numeric_limits<int16_t>::min()};
+  int16_t qmax_{std::numeric_limits<int16_t>::max()};
   size_t iterations_{1};
 };
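A closing note on the member changes: qmin_ and qmax_ widen from uint8_t to int16_t, presumably so one tester can hold either the signed [-128, 127] or the unsigned [0, 255] clamping range before the per-test assertions narrow it; both ranges fit comfortably:

    #include <cstdint>
    #include <limits>

    static_assert(std::numeric_limits<int8_t>::min() >= std::numeric_limits<int16_t>::min(),
                  "int16_t covers the int8_t minimum");
    static_assert(std::numeric_limits<uint8_t>::max() <= std::numeric_limits<int16_t>::max(),
                  "int16_t covers the uint8_t maximum");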