Signed requantization evaluation stubs and unit tests
PiperOrigin-RevId: 323887449
diff --git a/BUILD.bazel b/BUILD.bazel
index f98fb51..8ffbba8 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -358,6 +358,12 @@
"src/math/sigmoid-scalar-lut2048-p1-div.c",
"src/math/sigmoid-scalar-lut64-p2-div.c",
"src/math/sigmoid-scalar-p5-div.c",
+ "src/qs8-requantization/fp32-scalar-lrintf.c",
+ "src/qs8-requantization/fp32-scalar-magic.c",
+ "src/qs8-requantization/precise-scalar-signed64.c",
+ "src/qs8-requantization/precise-scalar-unsigned32.c",
+ "src/qs8-requantization/precise-scalar-unsigned64.c",
+ "src/qs8-requantization/q31-scalar.c",
"src/qu8-avgpool/9p8x-minmax-scalar-c1.c",
"src/qu8-avgpool/9x-minmax-scalar-c1.c",
"src/qu8-dwconv/up1x9-minmax-scalar.c",
@@ -842,6 +848,7 @@
"src/f32-vunary/gen/vneg-wasmsimd-x8.c",
"src/f32-vunary/gen/vsqr-wasmsimd-x4.c",
"src/f32-vunary/gen/vsqr-wasmsimd-x8.c",
+ "src/qs8-requantization/fp32-wasmsimd.c",
"src/qu8-requantization/fp32-wasmsimd.c",
"src/x32-fill/wasmsimd.c",
"src/x32-packx/x4-wasmsimd.c",
@@ -874,6 +881,8 @@
]
PSIMD_ACCMATH_UKERNELS = [
+ "src/qs8-requantization/precise-psimd.c",
+ "src/qs8-requantization/fp32-psimd.c",
"src/qu8-requantization/precise-psimd.c",
"src/qu8-requantization/fp32-psimd.c",
]
@@ -1066,6 +1075,9 @@
"src/f32-vunary/gen/vneg-neon-x8.c",
"src/f32-vunary/gen/vsqr-neon-x4.c",
"src/f32-vunary/gen/vsqr-neon-x8.c",
+ "src/qs8-requantization/precise-neon.c",
+ "src/qs8-requantization/fp32-neon.c",
+ "src/qs8-requantization/q31-neon.c",
"src/qu8-avgpool/9p8x-minmax-neon-c8.c",
"src/qu8-avgpool/9x-minmax-neon-c8.c",
"src/qu8-dwconv/up8x9-minmax-neon.c",
@@ -1625,6 +1637,9 @@
"src/f32-vrnd/gen/vrndu-sse2-x8.c",
"src/f32-vrnd/gen/vrndd-sse2-x4.c",
"src/f32-vrnd/gen/vrndd-sse2-x8.c",
+ "src/qs8-requantization/precise-sse2.c",
+ "src/qs8-requantization/fp32-sse2.c",
+ "src/qs8-requantization/q31-sse2.c",
"src/qu8-avgpool/9p8x-minmax-sse2-c8.c",
"src/qu8-avgpool/9x-minmax-sse2-c8.c",
"src/qu8-igemm/4x4c2-minmax-sse2.c",
@@ -1659,6 +1674,8 @@
]
SSSE3_UKERNELS = [
+ "src/qs8-requantization/precise-ssse3.c",
+ "src/qs8-requantization/q31-ssse3.c",
"src/qu8-requantization/precise-ssse3.c",
"src/qu8-requantization/q31-ssse3.c",
]
@@ -1682,6 +1699,9 @@
"src/f32-vrnd/gen/vrndu-sse41-x8.c",
"src/f32-vrnd/gen/vrndd-sse41-x4.c",
"src/f32-vrnd/gen/vrndd-sse41-x8.c",
+ "src/qs8-requantization/fp32-sse4.c",
+ "src/qs8-requantization/precise-sse4.c",
+ "src/qs8-requantization/q31-sse4.c",
"src/qu8-requantization/precise-sse4.c",
"src/qu8-requantization/q31-sse4.c",
"src/math/roundne-sse41.c",
@@ -4971,6 +4991,16 @@
)
xnnpack_unit_test(
+ name = "qs8_requantization_test",
+ srcs = [
+ "src/xnnpack/requantization-stubs.h",
+ "test/qs8-requantization.cc",
+ "test/requantization-tester.h",
+ ] + MICROKERNEL_TEST_HDRS,
+ deps = MICROKERNEL_TEST_DEPS,
+)
+
+xnnpack_unit_test(
name = "qu8_avgpool_minmax_test",
srcs = [
"test/qu8-avgpool-minmax.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3bb9cef..53a4f9f 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -481,6 +481,12 @@
src/math/sigmoid-scalar-lut2048-p1-div.c
src/math/sigmoid-scalar-lut64-p2-div.c
src/math/sigmoid-scalar-p5-div.c
+ src/qs8-requantization/fp32-scalar-lrintf.c
+ src/qs8-requantization/fp32-scalar-magic.c
+ src/qs8-requantization/precise-scalar-signed64.c
+ src/qs8-requantization/precise-scalar-unsigned32.c
+ src/qs8-requantization/precise-scalar-unsigned64.c
+ src/qs8-requantization/q31-scalar.c
src/qu8-avgpool/9p8x-minmax-scalar-c1.c
src/qu8-avgpool/9x-minmax-scalar-c1.c
src/qu8-dwconv/up1x9-minmax-scalar.c
@@ -528,6 +534,8 @@
src/f32-spmm/gen/16x1-minmax-psimd.c)
SET(XNNPACK_PSIMD_ACCMATH_MICROKERNEL_SRCS
+ src/qs8-requantization/fp32-psimd.c
+ src/qs8-requantization/precise-psimd.c
src/qu8-requantization/fp32-psimd.c
src/qu8-requantization/precise-psimd.c)
@@ -718,6 +726,9 @@
src/f32-vunary/gen/vneg-neon-x8.c
src/f32-vunary/gen/vsqr-neon-x4.c
src/f32-vunary/gen/vsqr-neon-x8.c
+ src/qs8-requantization/precise-neon.c
+ src/qs8-requantization/fp32-neon.c
+ src/qs8-requantization/q31-neon.c
src/qu8-avgpool/9p8x-minmax-neon-c8.c
src/qu8-avgpool/9x-minmax-neon-c8.c
src/qu8-dwconv/up8x9-minmax-neon.c
@@ -1271,6 +1282,9 @@
src/f32-vrnd/gen/vrndu-sse2-x8.c
src/f32-vrnd/gen/vrndd-sse2-x4.c
src/f32-vrnd/gen/vrndd-sse2-x8.c
+ src/qs8-requantization/fp32-sse2.c
+ src/qs8-requantization/precise-sse2.c
+ src/qs8-requantization/q31-sse2.c
src/qu8-avgpool/9p8x-minmax-sse2-c8.c
src/qu8-avgpool/9x-minmax-sse2-c8.c
src/qu8-igemm/4x4c2-minmax-sse2.c
@@ -1304,6 +1318,8 @@
src/math/sigmoid-sse2-p5-div.c)
SET(XNNPACK_SSSE3_MICROKERNEL_SRCS
+ src/qs8-requantization/precise-ssse3.c
+ src/qs8-requantization/q31-ssse3.c
src/qu8-requantization/precise-ssse3.c
src/qu8-requantization/q31-ssse3.c)
@@ -1326,6 +1342,9 @@
src/f32-vrnd/gen/vrndu-sse41-x8.c
src/f32-vrnd/gen/vrndd-sse41-x4.c
src/f32-vrnd/gen/vrndd-sse41-x8.c
+ src/qs8-requantization/fp32-sse4.c
+ src/qs8-requantization/precise-sse4.c
+ src/qs8-requantization/q31-sse4.c
src/qu8-requantization/precise-sse4.c
src/qu8-requantization/q31-sse4.c
src/math/roundne-sse41.c
diff --git a/src/qs8-requantization/fp32-neon.c b/src/qs8-requantization/fp32-neon.c
new file mode 100644
index 0000000..448cf25
--- /dev/null
+++ b/src/qs8-requantization/fp32-neon.c
@@ -0,0 +1,134 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_fp32__neon(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{
+ assert(n % 16 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const float32x4_t vscale = vdupq_n_f32(scale);
+#ifdef __aarch64__
+ const int16x8_t vzero_point = vdupq_n_s16((int16_t) zero_point);
+ const int8x16_t vqmin = vdupq_n_s8(qmin);
+ const int8x16_t vqmax = vdupq_n_s8(qmax);
+#else
+ const float32x4_t vfmin = vdupq_n_f32((float) ((int32_t) qmin - (int32_t) zero_point));
+ const float32x4_t vfmax = vdupq_n_f32((float) ((int32_t) qmax - (int32_t) zero_point));
+ const float32x4_t vfmagic = vdupq_n_f32(12582912.0f);
+ const int32x4_t vimagic = vdupq_n_s32(INT32_C(0x4B400000) - (int32_t) zero_point);
+#endif
+ for (; n != 0; n -= 16) {
+ const int32x4_t x = vld1q_s32(input);
+ const int32x4_t y = vld1q_s32(input + 4);
+ const int32x4_t z = vld1q_s32(input + 8);
+ const int32x4_t w = vld1q_s32(input + 12);
+ input += 16;
+
+ // Convert int32_t input to FP32 and multiply by FP32 scale.
+ // Both operations involve statistically unbiased roundings:
+ // - Large int32_t values can't be exactly represented as FP32. The conversion instruction in ARM NEON would
+ // round it to nearest FP32 value with ties to even.
+ // - Product of two FP32 values is generally not exactly representable as an FP32 value, and will be rounded
+ // to nearest FP32 value with ties to even.
+ const float32x4_t x_scaled = vmulq_f32(vcvtq_f32_s32(x), vscale);
+ const float32x4_t y_scaled = vmulq_f32(vcvtq_f32_s32(y), vscale);
+ const float32x4_t z_scaled = vmulq_f32(vcvtq_f32_s32(z), vscale);
+ const float32x4_t w_scaled = vmulq_f32(vcvtq_f32_s32(w), vscale);
+
+#ifdef __aarch64__
+ // Leverage "Floating-point Convert to Signed integer, rounding to nearest with ties to even" instruction.
+ // This is an ARMv8 instruction (always available in AArch64), which saturates result on overflow.
+ // We don't need to specifically consider saturated results, they will be clamped at the last stage.
+ const int32x4_t x_rounded = vcvtnq_s32_f32(x_scaled);
+ const int32x4_t y_rounded = vcvtnq_s32_f32(y_scaled);
+ const int32x4_t z_rounded = vcvtnq_s32_f32(z_scaled);
+ const int32x4_t w_rounded = vcvtnq_s32_f32(w_scaled);
+
+ // Standard final sequence on ARM NEON:
+ // - Pack to int16_t and saturate
+ // - Add zero point
+ // - Pack to int8_t and saturate
+ // - Clamp between qmin and qmax
+ const int16x8_t xy_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(x_rounded), y_rounded), vzero_point);
+ const int16x8_t zw_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(z_rounded), w_rounded), vzero_point);
+ const int8x16_t xyzw_packed = vqmovn_high_s16(vqmovn_s16(xy_packed), zw_packed);
+ const int8x16_t xyzw_clamped = vmaxq_s8(vminq_s8(xyzw_packed, vqmax), vqmin);
+
+ // AArch64 version:
+ // 4x SCVTF Vd.4S, Vn.4S
+ // 4x FMUL Vd.4S, Vn.4S, Vm.4S
+ // 4x FCVTNS Vd.4S, Vn.4S
+ // 2x SQXTN Vd.4H, Vn.4S + 2x SQXTN2 Vd.8H, Vn.4S
+ // 2x SQADD Vd.8H, Vn.8H, Vm.8H
+ // 1x SQXTN Vd.8B, Vn.8H + 1x SQXTN2 Vd.16B, Vn.8H
+ // 1x SMIN Vd.16B, Vn.16B, Vm.16B
+ // 1x SMAX Vd.16B, Vn.16B, Vm.16B
+ // ---------------------
+ // 22 instructions total
+ vst1q_s8(output, xyzw_clamped); output += 16;
+#else
+ // AArch32 version (uses the magic-number trick below, because ARMv7 NEON
+ // lacks a round-to-nearest-even float-to-integer conversion instruction):
+ //
+ // 4x VCVT.F32.S32 Qd, Qm
+ // 4x VMUL.F32 Qd, Qm, Qn
+ // 4x VMIN.F32 Qd, Qm, Qn
+ // 4x VMAX.F32 Qd, Qm, Qn
+ // 4x VADD.F32 Qd, Qm, Qn
+ // 4x VSUB.S32 Qd, Qm, Qn
+ // 4x VMOVN.I32 Dd, Qm
+ // 2x VMOVN.I16 Dd, Qm
+ // ---------------------
+ // 30 instructions total
+
+ // ARMv7 NEON offers only a floating-point to integer conversion instruction with rounding towards zero.
+ // In lieu of conversion instruction with rounding-to-nearest-even, we use a magic trick of adding a large
+ // number (1.5 * 2**23) to scaled value to cause rounding to integer, and then subtracting this magic number as
+ // integer. This trick works only in a limited range (absolute value of input must be less than 2**22), so
+ // generally we have to clamp input to this range before using the magic. However, clamping to any smaller range
+ // works just as well, and thus we clamp to [qmin - zero point, qmax - zero point] range so that after we add
+ // zero point to the result, it gets into target [qmin, qmax] range.
+ const float32x4_t x_clamped = vminq_f32(vmaxq_f32(x_scaled, vfmin), vfmax);
+ const float32x4_t y_clamped = vminq_f32(vmaxq_f32(y_scaled, vfmin), vfmax);
+ const float32x4_t z_clamped = vminq_f32(vmaxq_f32(z_scaled, vfmin), vfmax);
+ const float32x4_t w_clamped = vminq_f32(vmaxq_f32(w_scaled, vfmin), vfmax);
+
+ // Conversion to integer using the "magic trick". Rounding is performed in the output of addition operation,
+ // and result is rounded to nearest integer with ties to even.
+ const int32x4_t x_biased = vsubq_s32(vreinterpretq_s32_f32(vaddq_f32(x_clamped, vfmagic)), vimagic);
+ const int32x4_t y_biased = vsubq_s32(vreinterpretq_s32_f32(vaddq_f32(y_clamped, vfmagic)), vimagic);
+ const int32x4_t z_biased = vsubq_s32(vreinterpretq_s32_f32(vaddq_f32(z_clamped, vfmagic)), vimagic);
+ const int32x4_t w_biased = vsubq_s32(vreinterpretq_s32_f32(vaddq_f32(w_clamped, vfmagic)), vimagic);
+
+ // Select low 8 bits of each 32-bit integer in the vectors for the output.
+ // Since result is already clamped to [qmin, qmax] subrange of [-128, 127], saturation is not needed.
+ const int16x8_t xy_packed = vcombine_s16(vmovn_s32(x_biased), vmovn_s32(y_biased));
+ const int16x8_t zw_packed = vcombine_s16(vmovn_s32(z_biased), vmovn_s32(w_biased));
+ const int8x16_t xyzw_packed = vcombine_s8(vmovn_s16(xy_packed), vmovn_s16(zw_packed));
+
+ vst1q_s8(output, xyzw_packed); output += 16;
+#endif
+ }
+}
diff --git a/src/qs8-requantization/fp32-psimd.c b/src/qs8-requantization/fp32-psimd.c
new file mode 100644
index 0000000..21958a1
--- /dev/null
+++ b/src/qs8-requantization/fp32-psimd.c
@@ -0,0 +1,84 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <psimd.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_fp32__psimd(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{
+ assert(n % 16 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const psimd_f32 vscale = psimd_splat_f32(scale);
+ const psimd_f32 vfmin = psimd_splat_f32((float) ((int32_t) qmin - (int32_t) zero_point));
+ const psimd_f32 vfmax = psimd_splat_f32((float) ((int32_t) qmax - (int32_t) zero_point));
+ const psimd_f32 vfmagic = psimd_splat_f32(12582912.0f);
+ const psimd_s32 vimagic = psimd_splat_s32(INT32_C(0x4B400000) - (int32_t) zero_point);
+ for (; n != 0; n -= 16) {
+ const psimd_s32 x = psimd_load_s32(input);
+ const psimd_s32 y = psimd_load_s32(input + 4);
+ const psimd_s32 z = psimd_load_s32(input + 8);
+ const psimd_s32 w = psimd_load_s32(input + 12);
+ input += 16;
+
+ // Convert int32_t input to FP32 and multiply by FP32 scale.
+ // Both operations involve roundings:
+ // - Large int32_t values can't be exactly represented as FP32. We expect that conversion instruction would
+ // round it to nearest FP32 value with ties to even, but Clang documentation for __builtin_convertvector does
+ // not guarantee that.
+ // - Product of two FP32 values is generally not exactly representable as an FP32 value, and will be rounded
+ // to nearest FP32 value with ties to even.
+ const psimd_f32 x_scaled = psimd_cvt_s32_f32(x) * vscale;
+ const psimd_f32 y_scaled = psimd_cvt_s32_f32(y) * vscale;
+ const psimd_f32 z_scaled = psimd_cvt_s32_f32(z) * vscale;
+ const psimd_f32 w_scaled = psimd_cvt_s32_f32(w) * vscale;
+
+ // Clang/gcc vector extension does not provide an intrinsics for a floating-point to integer conversion
+ // operation with rounding-to-nearest-even. In lieu of such intrinsic, we use a magic trick of adding a large
+ // number (1.5 * 2**23) to scaled value to cause rounding to integer, and then subtracting this magic number as
+ // integer. This trick works only in a limited range (absolute value of input must be less than 2**22), so
+ // generally we have to clamp input to this range before using the magic. However, clamping to any smaller range
+ // works just as well, and thus we clamp to [qmin - zero point, qmax - zero point] range so that after we add
+ // zero point to the result, it gets into target [qmin, qmax] range.
+ const psimd_f32 x_clamped = psimd_min_f32(psimd_max_f32(x_scaled, vfmin), vfmax);
+ const psimd_f32 y_clamped = psimd_min_f32(psimd_max_f32(y_scaled, vfmin), vfmax);
+ const psimd_f32 z_clamped = psimd_min_f32(psimd_max_f32(z_scaled, vfmin), vfmax);
+ const psimd_f32 w_clamped = psimd_min_f32(psimd_max_f32(w_scaled, vfmin), vfmax);
+
+ // Conversion to integer using the "magic trick". Rounding is performed in the output of addition operation,
+ // and result is rounded to nearest integer with ties to even.
+ const psimd_s32 x_biased = (psimd_s32) (x_clamped + vfmagic) - vimagic;
+ const psimd_s32 y_biased = (psimd_s32) (y_clamped + vfmagic) - vimagic;
+ const psimd_s32 z_biased = (psimd_s32) (z_clamped + vfmagic) - vimagic;
+ const psimd_s32 w_biased = (psimd_s32) (w_clamped + vfmagic) - vimagic;
+
+ // Select low 8 bits of each 32-bit integer in the vectors for the output.
+ // Since result is already clamped to [qmin, qmax] subrange of [-128, 127], saturation is not needed.
+ const psimd_s16 xy_packed = psimd_concat_even_s16((psimd_s16) x_biased, (psimd_s16) y_biased);
+ const psimd_s16 zw_packed = psimd_concat_even_s16((psimd_s16) z_biased, (psimd_s16) w_biased);
+
+ const psimd_s8 xyzw_packed = psimd_concat_even_s8((psimd_s8) xy_packed, (psimd_s8) zw_packed);
+
+ psimd_store_s8(output, xyzw_packed);
+ output += 16;
+ }
+}
diff --git a/src/qs8-requantization/fp32-scalar-lrintf.c b/src/qs8-requantization/fp32-scalar-lrintf.c
new file mode 100644
index 0000000..29937fe
--- /dev/null
+++ b/src/qs8-requantization/fp32-scalar-lrintf.c
@@ -0,0 +1,67 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_fp32__scalar_lrintf(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{
+ assert(n % 4 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const long lmin = (long) ((int32_t) qmin - (int32_t) zero_point);
+ const long lmax = (long) ((int32_t) qmax - (int32_t) zero_point);
+ for (; n != 0; n -= 4) {
+ const int32_t x = input[0];
+ const int32_t y = input[1];
+ const int32_t z = input[2];
+ const int32_t w = input[3];
+ input += 4;
+
+ const float x_scaled = (float) x * scale;
+ const float y_scaled = (float) y * scale;
+ const float z_scaled = (float) z * scale;
+ const float w_scaled = (float) w * scale;
+
+ const long x_rounded = lrintf(x_scaled);
+ const long y_rounded = lrintf(y_scaled);
+ const long z_rounded = lrintf(z_scaled);
+ const long w_rounded = lrintf(w_scaled);
+
+ const int32_t x_clamped = (int32_t) (x_rounded < lmin ? lmin : x_rounded > lmax ? lmax : x_rounded);
+ const int32_t y_clamped = (int32_t) (y_rounded < lmin ? lmin : y_rounded > lmax ? lmax : y_rounded);
+ const int32_t z_clamped = (int32_t) (z_rounded < lmin ? lmin : z_rounded > lmax ? lmax : z_rounded);
+ const int32_t w_clamped = (int32_t) (w_rounded < lmin ? lmin : w_rounded > lmax ? lmax : w_rounded);
+
+ const int32_t x_biased = x_clamped + (int32_t) zero_point;
+ const int32_t y_biased = y_clamped + (int32_t) zero_point;
+ const int32_t z_biased = z_clamped + (int32_t) zero_point;
+ const int32_t w_biased = w_clamped + (int32_t) zero_point;
+
+ output[0] = (int8_t) x_biased;
+ output[1] = (int8_t) y_biased;
+ output[2] = (int8_t) z_biased;
+ output[3] = (int8_t) w_biased;
+ output += 4;
+ }
+}
diff --git a/src/qs8-requantization/fp32-scalar-magic.c b/src/qs8-requantization/fp32-scalar-magic.c
new file mode 100644
index 0000000..44aea1a
--- /dev/null
+++ b/src/qs8-requantization/fp32-scalar-magic.c
@@ -0,0 +1,64 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_fp32__scalar_magic(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{
+ assert(n % 4 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const float fmin = (float) ((int32_t) qmin - (int32_t) zero_point);
+ const float fmax = (float) ((int32_t) qmax - (int32_t) zero_point);
+ const float fmagic = 12582912.0f;
+ const int32_t imagic = INT32_C(0x4B400000) - (int32_t) zero_point;
+ for (; n != 0; n -= 4) {
+ const int32_t x = input[0];
+ const int32_t y = input[1];
+ const int32_t z = input[2];
+ const int32_t w = input[3];
+ input += 4;
+
+ const float x_scaled = (float) x * scale;
+ const float y_scaled = (float) y * scale;
+ const float z_scaled = (float) z * scale;
+ const float w_scaled = (float) w * scale;
+
+ const float x_clamped = x_scaled < fmin ? fmin : x_scaled > fmax ? fmax : x_scaled;
+ const float y_clamped = y_scaled < fmin ? fmin : y_scaled > fmax ? fmax : y_scaled;
+ const float z_clamped = z_scaled < fmin ? fmin : z_scaled > fmax ? fmax : z_scaled;
+ const float w_clamped = w_scaled < fmin ? fmin : w_scaled > fmax ? fmax : w_scaled;
+
+ const int32_t x_biased = (int32_t) fp32_to_bits(x_clamped + fmagic) - imagic;
+ const int32_t y_biased = (int32_t) fp32_to_bits(y_clamped + fmagic) - imagic;
+ const int32_t z_biased = (int32_t) fp32_to_bits(z_clamped + fmagic) - imagic;
+ const int32_t w_biased = (int32_t) fp32_to_bits(w_clamped + fmagic) - imagic;
+
+ output[0] = (int8_t) x_biased;
+ output[1] = (int8_t) y_biased;
+ output[2] = (int8_t) z_biased;
+ output[3] = (int8_t) w_biased;
+ output += 4;
+ }
+}
diff --git a/src/qs8-requantization/fp32-sse2.c b/src/qs8-requantization/fp32-sse2.c
new file mode 100644
index 0000000..b7518c9
--- /dev/null
+++ b/src/qs8-requantization/fp32-sse2.c
@@ -0,0 +1,94 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_fp32__sse2(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{
+ assert(n % 16 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const __m128 vscale = _mm_set1_ps(scale);
+ const __m128i vzero_point = _mm_set1_epi16((short) (uint16_t) zero_point);
+ const __m128i vqmin = _mm_set1_epi8((char) qmin);
+ const __m128i vqmax = _mm_set1_epi8((char) qmax);
+ for (; n != 0; n -= 16) {
+ const __m128i x = _mm_loadu_si128((const __m128i*) input);
+ const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+ const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+ const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+ input += 16;
+
+ // Convert int32_t input to FP32 and multiply by FP32 scale.
+ // Both operations involve statistically unbiased roundings (with default MXCSR rounding mode):
+ // - Large int32_t values can't be exactly represented as FP32. CVTDQ2PS instruction on x86 would round it
+ // according to nearest FP32 value with ties to even (assuming default MXCSR rounding mode).
+ // - Product of two FP32 values is generally not exactly representable as an FP32 value, and will be rounded
+ // to nearest FP32 value with ties to even with default MXCSR rounding mode.
+ const __m128 x_scaled = _mm_mul_ps(_mm_cvtepi32_ps(x), vscale);
+ const __m128 y_scaled = _mm_mul_ps(_mm_cvtepi32_ps(y), vscale);
+ const __m128 z_scaled = _mm_mul_ps(_mm_cvtepi32_ps(z), vscale);
+ const __m128 w_scaled = _mm_mul_ps(_mm_cvtepi32_ps(w), vscale);
+
+ // Convert scaled FP32 result to int32_t using CVTPS2DQ instruction from x86 SSE2. CVTPS2DQ instruction rounds
+ // result according to nearest FP32 value with ties to even (assuming default MXCSR rounding mode).
+ // However, when conversion overflows, it produces INT32_MIN as a result. For large positive inputs the result
+ // of conversion can become negative, which affects the final requantization result. Note that on x86 SSE2 we
+ // have e.g. int32_t(float(INT32_MAX)) == INT32_MIN! This happens because float(INT32_MAX) rounds to 2**31,
+ // which overflows int32_t when it is converted back to integer.
+ //
+ // Thankfully, we can prove that overflow never happens in this requantization scheme. The largest positive
+ // input is INT32_MAX (2**31 - 1), which turns into 2**31 when converted to float. The largest scale value
+ // is 0x1.FFFFFEp-1. When multiplied together, the result is 2147483520 (compare to INT32_MAX = 2147483647),
+ // which fits into int32_t without overflow.
+ const __m128i x_rounded = _mm_cvtps_epi32(x_scaled);
+ const __m128i y_rounded = _mm_cvtps_epi32(y_scaled);
+ const __m128i z_rounded = _mm_cvtps_epi32(z_scaled);
+ const __m128i w_rounded = _mm_cvtps_epi32(w_scaled);
+
+ // Standard final sequence on x86 SSE2:
+ // - Pack to int16_t and saturate
+ // - Add zero point
+ // - Clamp between qmin and qmax
+ // - Pack to int8_t and saturate
+ const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_rounded, y_rounded), vzero_point);
+ const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_rounded, w_rounded), vzero_point);
+ const __m128i xy_clamped = _mm_max_epi16(_mm_min_epi16(xy_packed, vqmax), vqmin);
+ const __m128i zw_clamped = _mm_max_epi16(_mm_min_epi16(zw_packed, vqmax), vqmin);
+ const __m128i xyzw_clamped = _mm_packs_epi16(xy_clamped, zw_clamped);
+
+ // 4x CVTDQ2PS
+ // 4x MULPS
+ // 4x CVTPS2DQ
+ // 2x PACKSSDW
+ // 2x PADDSW
+ // 2x PMAXSW
+ // 2x PMINSW
+ // 1x PACKSSWB
+ // ---------------------
+ // 21 instructions total
+
+ _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+ output += 16;
+ }
+}
diff --git a/src/qs8-requantization/fp32-sse4.c b/src/qs8-requantization/fp32-sse4.c
new file mode 100644
index 0000000..1df6463
--- /dev/null
+++ b/src/qs8-requantization/fp32-sse4.c
@@ -0,0 +1,93 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <nmmintrin.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_fp32__sse4(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{
+ assert(n % 16 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const __m128 vscale = _mm_set1_ps(scale);
+ const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+ const __m128i vqmin = _mm_set1_epi8((char) qmin);
+ const __m128i vqmax = _mm_set1_epi8((char) qmax);
+ for (; n != 0; n -= 16) {
+ const __m128i x = _mm_loadu_si128((const __m128i*) input);
+ const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+ const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+ const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+ input += 16;
+
+ // Convert int32_t input to FP32 and multiply by FP32 scale.
+ // Both operations involve statistically unbiased roundings (with default MXCSR rounding mode):
+ // - Large int32_t values can't be exactly represented as FP32. CVTDQ2PS instruction on x86 would round it
+ // according to nearest FP32 value with ties to even (assuming default MXCSR rounding mode).
+ // - Product of two FP32 values is generally not exactly representable as an FP32 value, and will be rounded
+ // to nearest FP32 value with ties to even with default MXCSR rounding mode.
+ const __m128 x_scaled = _mm_mul_ps(_mm_cvtepi32_ps(x), vscale);
+ const __m128 y_scaled = _mm_mul_ps(_mm_cvtepi32_ps(y), vscale);
+ const __m128 z_scaled = _mm_mul_ps(_mm_cvtepi32_ps(z), vscale);
+ const __m128 w_scaled = _mm_mul_ps(_mm_cvtepi32_ps(w), vscale);
+
+ // Convert scaled FP32 result to int32_t using CVTPS2DQ instruction from x86 SSE2. CVTPS2DQ instruction rounds
+ // result according to nearest FP32 value with ties to even (assuming default MXCSR rounding mode).
+ // However, when conversion overflows, it produces INT32_MIN as a result. For large positive inputs the result
+ // of conversion can become negative, which affects the final requantization result. Note that on x86 SSE2 we
+ // have e.g. int32_t(float(INT32_MAX)) == INT32_MIN! This happens because float(INT32_MAX) rounds to 2**31,
+ // which overflows int32_t when it is converted back to integer.
+ //
+ // Thankfully, we can prove that overflow never happens in this requantization scheme. The largest positive
+ // input is INT32_MAX (2**31 - 1), which turns into 2**31 when converted to float. The largest scale value
+ // is 0x1.FFFFFEp-1. When multiplied together, the result is 2147483520 (compare to INT32_MAX = 2147483647),
+ // which fits into int32_t without overflow.
+ const __m128i x_rounded = _mm_cvtps_epi32(x_scaled);
+ const __m128i y_rounded = _mm_cvtps_epi32(y_scaled);
+ const __m128i z_rounded = _mm_cvtps_epi32(z_scaled);
+ const __m128i w_rounded = _mm_cvtps_epi32(w_scaled);
+
+ // Standard final sequence on x86 SSE4:
+ // - Pack to int16_t and saturate
+ // - Add zero point
+ // - Pack to int8_t and saturate
+ // - Clamp between qmin and qmax
+ const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_rounded, y_rounded), vzero_point);
+ const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_rounded, w_rounded), vzero_point);
+ const __m128i xyzw_packed = _mm_packs_epi16(xy_packed, zw_packed);
+ const __m128i xyzw_clamped = _mm_max_epi8(_mm_min_epi8(xyzw_packed, vqmax), vqmin);
+
+ // 4x CVTDQ2PS
+ // 4x MULPS
+ // 4x CVTPS2DQ
+ // 2x PACKSSDW
+ // 2x PADDSW
+ // 1x PACKSSWB
+ // 1x PMAXSB
+ // 1x PMINSB
+ // ---------------------
+ // 19 instructions total
+
+ _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+ output += 16;
+ }
+}
diff --git a/src/qs8-requantization/fp32-wasmsimd.c b/src/qs8-requantization/fp32-wasmsimd.c
new file mode 100644
index 0000000..b7458f8
--- /dev/null
+++ b/src/qs8-requantization/fp32-wasmsimd.c
@@ -0,0 +1,89 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <wasm_simd128.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_fp32__wasmsimd(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{
+ assert(n % 16 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const v128_t vscale = wasm_f32x4_splat(scale);
+ const v128_t vfmin = wasm_f32x4_splat((float) ((int32_t) qmin - (int32_t) zero_point));
+ const v128_t vfmax = wasm_f32x4_splat((float) ((int32_t) qmax - (int32_t) zero_point));
+ const v128_t vfmagic = wasm_f32x4_splat(12582912.0f);
+ const v128_t vimagic = wasm_i32x4_splat(INT32_C(0x4B400000) - (int32_t) zero_point);
+ for (; n != 0; n -= 16) {
+ const v128_t x = wasm_v128_load(input);
+ const v128_t y = wasm_v128_load(input + 4);
+ const v128_t z = wasm_v128_load(input + 8);
+ const v128_t w = wasm_v128_load(input + 12);
+ input += 16;
+
+ // Convert int32_t input to FP32 and multiply by FP32 scale.
+ // Both operations involve statistically unbiased roundings:
+ // - Large int32_t values can't be exactly represented as FP32. The conversion instruction in WAsm SIMD would
+ // round it to nearest FP32 value with ties to even.
+ // - Product of two FP32 values is generally not exactly representable as an FP32 value, and will be rounded
+ // to nearest FP32 value with ties to even.
+ const v128_t x_scaled = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(x), vscale);
+ const v128_t y_scaled = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(y), vscale);
+ const v128_t z_scaled = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(z), vscale);
+ const v128_t w_scaled = wasm_f32x4_mul(wasm_f32x4_convert_i32x4(w), vscale);
+
+ // WAsm SIMD offers only a floating-point to integer conversion instruction with rounding towards zero.
+ // In lieu of conversion instruction with rounding-to-nearest-even, we use a magic trick of adding a large
+ // number (1.5 * 2**23) to scaled value to cause rounding to integer, and then subtracting this magic number as
+ // integer. This trick works only in a limited range (absolute value of input must be less than 2**22), so
+ // generally we have to clamp input to this range before using the magic. However, clamping to any smaller range
+ // works just as well, and thus we clamp to [qmin - zero point, qmax - zero point] range so that after we add
+ // zero point to the result, it gets into target [qmin, qmax] range.
+ const v128_t x_clamped = wasm_f32x4_min(wasm_f32x4_max(x_scaled, vfmin), vfmax);
+ const v128_t y_clamped = wasm_f32x4_min(wasm_f32x4_max(y_scaled, vfmin), vfmax);
+ const v128_t z_clamped = wasm_f32x4_min(wasm_f32x4_max(z_scaled, vfmin), vfmax);
+ const v128_t w_clamped = wasm_f32x4_min(wasm_f32x4_max(w_scaled, vfmin), vfmax);
+
+ // Conversion to integer using the "magic trick". Rounding is performed in the output of addition operation,
+ // and result is rounded to nearest integer with ties to even.
+ const v128_t x_biased = wasm_i32x4_sub(wasm_f32x4_add(x_clamped, vfmagic), vimagic);
+ const v128_t y_biased = wasm_i32x4_sub(wasm_f32x4_add(y_clamped, vfmagic), vimagic);
+ const v128_t z_biased = wasm_i32x4_sub(wasm_f32x4_add(z_clamped, vfmagic), vimagic);
+ const v128_t w_biased = wasm_i32x4_sub(wasm_f32x4_add(w_clamped, vfmagic), vimagic);
+
+ // Select low 8 bits of each 32-bit integer in the vectors for the output.
+ // Since result is already clamped to [qmin, qmax] subrange of [-128, 127], saturation is not needed.
+ const v128_t xy_packed = wasm_v16x8_shuffle(x_biased, y_biased, 0, 2, 4, 6, 8, 10, 12, 14);
+ const v128_t zw_packed = wasm_v16x8_shuffle(z_biased, w_biased, 0, 2, 4, 6, 8, 10, 12, 14);
+ const v128_t xyzw_packed = wasm_v8x16_shuffle(xy_packed, zw_packed, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+
+ // 4x f32x4.convert_i32x4_s
+ // 4x f32x4.mul
+ // 4x f32x4.max
+ // 4x f32x4.min
+ // 4x f32x4.add
+ // 4x i32x4.sub
+ // 3x v8x16.shuffle
+ // ---------------------
+ // 29 instructions total
+
+ wasm_v128_store(output, xyzw_packed);
+ output += 16;
+ }
+}
diff --git a/src/qs8-requantization/precise-neon.c b/src/qs8-requantization/precise-neon.c
new file mode 100644
index 0000000..b95c6de
--- /dev/null
+++ b/src/qs8-requantization/precise-neon.c
@@ -0,0 +1,166 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <arm_neon.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_precise__neon(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{
+ assert(n % 16 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const uint32_t scale_bits = fp32_to_bits(scale);
+ const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
+ const int32_t shift = 127 + 23 - (scale_bits >> 23);
+ assert(shift >= 24);
+ assert(shift < 56);
+
+#if defined(__aarch64__)
+ const int32x4_t vmultiplier = vdupq_n_s32(multiplier);
+#else
+ const int32x2_t vmultiplier = vdup_n_s32(multiplier);
+#endif
+ const int16x8_t vzero_point = vdupq_n_s16((int16_t) zero_point);
+ const int64x2_t vshift = vdupq_n_s64(-shift);
+ const int8x16_t vqmin = vdupq_n_s8(qmin);
+ const int8x16_t vqmax = vdupq_n_s8(qmax);
+ for (; n != 0; n -= 16) {
+ const int32x4_t x = vld1q_s32(input);
+ const int32x4_t y = vld1q_s32(input + 4);
+ const int32x4_t z = vld1q_s32(input + 8);
+ const int32x4_t w = vld1q_s32(input + 12);
+ input += 16;
+
+ // All-ones (-1) mask in lanes with negative input.
+ const uint32x4_t x_neg_mask = vcltq_s32(x, vmovq_n_s32(0));
+ const uint32x4_t y_neg_mask = vcltq_s32(y, vmovq_n_s32(0));
+ const uint32x4_t z_neg_mask = vcltq_s32(z, vmovq_n_s32(0));
+ const uint32x4_t w_neg_mask = vcltq_s32(w, vmovq_n_s32(0));
+ // Compute the full 64-bit product of each 32-bit input with the 32-bit multiplier.
+#if defined(__aarch64__)
+ const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vget_low_s32(vmultiplier));
+ const int64x2_t x23_product = vmull_high_s32(x, vmultiplier);
+ const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vget_low_s32(vmultiplier));
+ const int64x2_t y23_product = vmull_high_s32(y, vmultiplier);
+ const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vget_low_s32(vmultiplier));
+ const int64x2_t z23_product = vmull_high_s32(z, vmultiplier);
+ const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vget_low_s32(vmultiplier));
+ const int64x2_t w23_product = vmull_high_s32(w, vmultiplier);
+#else
+ const int64x2_t x01_product = vmull_s32(vget_low_s32(x), vmultiplier);
+ const int64x2_t x23_product = vmull_s32(vget_high_s32(x), vmultiplier);
+ const int64x2_t y01_product = vmull_s32(vget_low_s32(y), vmultiplier);
+ const int64x2_t y23_product = vmull_s32(vget_high_s32(y), vmultiplier);
+ const int64x2_t z01_product = vmull_s32(vget_low_s32(z), vmultiplier);
+ const int64x2_t z23_product = vmull_s32(vget_high_s32(z), vmultiplier);
+ const int64x2_t w01_product = vmull_s32(vget_low_s32(w), vmultiplier);
+ const int64x2_t w23_product = vmull_s32(vget_high_s32(w), vmultiplier);
+#endif
+ // Add -1 (the mask) to products of negative inputs, so that the round-up shift below behaves as round-away-from-zero.
+#if defined(__aarch64__)
+ const int64x2_t x01_adjusted_product = vaddw_s32(x01_product, vreinterpret_s32_u32(vget_low_u32(x_neg_mask)));
+ const int64x2_t x23_adjusted_product = vaddw_high_s32(x23_product, vreinterpretq_s32_u32(x_neg_mask));
+ const int64x2_t y01_adjusted_product = vaddw_s32(y01_product, vreinterpret_s32_u32(vget_low_u32(y_neg_mask)));
+ const int64x2_t y23_adjusted_product = vaddw_high_s32(y23_product, vreinterpretq_s32_u32(y_neg_mask));
+ const int64x2_t z01_adjusted_product = vaddw_s32(z01_product, vreinterpret_s32_u32(vget_low_u32(z_neg_mask)));
+ const int64x2_t z23_adjusted_product = vaddw_high_s32(z23_product, vreinterpretq_s32_u32(z_neg_mask));
+ const int64x2_t w01_adjusted_product = vaddw_s32(w01_product, vreinterpret_s32_u32(vget_low_u32(w_neg_mask)));
+ const int64x2_t w23_adjusted_product = vaddw_high_s32(w23_product, vreinterpretq_s32_u32(w_neg_mask));
+#else
+ const int64x2_t x01_adjusted_product = vaddw_s32(x01_product, vreinterpret_s32_u32(vget_low_u32(x_neg_mask)));
+ const int64x2_t x23_adjusted_product = vaddw_s32(x23_product, vreinterpret_s32_u32(vget_high_u32(x_neg_mask)));
+ const int64x2_t y01_adjusted_product = vaddw_s32(y01_product, vreinterpret_s32_u32(vget_low_u32(y_neg_mask)));
+ const int64x2_t y23_adjusted_product = vaddw_s32(y23_product, vreinterpret_s32_u32(vget_high_u32(y_neg_mask)));
+ const int64x2_t z01_adjusted_product = vaddw_s32(z01_product, vreinterpret_s32_u32(vget_low_u32(z_neg_mask)));
+ const int64x2_t z23_adjusted_product = vaddw_s32(z23_product, vreinterpret_s32_u32(vget_high_u32(z_neg_mask)));
+ const int64x2_t w01_adjusted_product = vaddw_s32(w01_product, vreinterpret_s32_u32(vget_low_u32(w_neg_mask)));
+ const int64x2_t w23_adjusted_product = vaddw_s32(w23_product, vreinterpret_s32_u32(vget_high_u32(w_neg_mask)));
+#endif
+ // Arithmetic shift right with rounding (VRSHL.S64 / SRSHL with a negative shift count).
+ const int64x2_t x01_scaled = vrshlq_s64(x01_adjusted_product, vshift);
+ const int64x2_t x23_scaled = vrshlq_s64(x23_adjusted_product, vshift);
+ const int64x2_t y01_scaled = vrshlq_s64(y01_adjusted_product, vshift);
+ const int64x2_t y23_scaled = vrshlq_s64(y23_adjusted_product, vshift);
+ const int64x2_t z01_scaled = vrshlq_s64(z01_adjusted_product, vshift);
+ const int64x2_t z23_scaled = vrshlq_s64(z23_adjusted_product, vshift);
+ const int64x2_t w01_scaled = vrshlq_s64(w01_adjusted_product, vshift);
+ const int64x2_t w23_scaled = vrshlq_s64(w23_adjusted_product, vshift);
+ // Narrow to 32 bits, saturate-pack to 16 bits with zero point added (saturating), then saturate-pack to 8 bits.
+#ifdef __aarch64__
+ const int32x4_t x_scaled = vuzp1q_s32(vreinterpretq_s32_s64(x01_scaled), vreinterpretq_s32_s64(x23_scaled));
+ const int32x4_t y_scaled = vuzp1q_s32(vreinterpretq_s32_s64(y01_scaled), vreinterpretq_s32_s64(y23_scaled));
+ const int32x4_t z_scaled = vuzp1q_s32(vreinterpretq_s32_s64(z01_scaled), vreinterpretq_s32_s64(z23_scaled));
+ const int32x4_t w_scaled = vuzp1q_s32(vreinterpretq_s32_s64(w01_scaled), vreinterpretq_s32_s64(w23_scaled));
+
+ const int16x8_t xy_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(x_scaled), y_scaled), vzero_point);
+ const int16x8_t zw_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(z_scaled), w_scaled), vzero_point);
+ const int8x16_t xyzw_packed = vqmovn_high_s16(vqmovn_s16(xy_packed), zw_packed);
+#else
+ const int32x4_t x_scaled = vcombine_s32(vmovn_s64(x01_scaled), vmovn_s64(x23_scaled));
+ const int32x4_t y_scaled = vcombine_s32(vmovn_s64(y01_scaled), vmovn_s64(y23_scaled));
+ const int32x4_t z_scaled = vcombine_s32(vmovn_s64(z01_scaled), vmovn_s64(z23_scaled));
+ const int32x4_t w_scaled = vcombine_s32(vmovn_s64(w01_scaled), vmovn_s64(w23_scaled));
+
+ const int16x8_t xy_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(x_scaled), vqmovn_s32(y_scaled)), vzero_point);
+ const int16x8_t zw_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(z_scaled), vqmovn_s32(w_scaled)), vzero_point);
+ const int8x16_t xyzw_packed = vcombine_s8(vqmovn_s16(xy_packed), vqmovn_s16(zw_packed));
+#endif
+
+ const int8x16_t xyzw_clamped = vmaxq_s8(vminq_s8(xyzw_packed, vqmax), vqmin);
+
+ // AArch32 version:
+ // 4x VCLT.S32 Qd, Qm, #0
+ // 8x VMULL.S32 Qd, Dm, Dn
+ // 8x VADDW.S32 Qd, Qm, Dn
+ // 8x VRSHL.S64 Qd, Qm, Qn
+ // 8x VMOVN.S64 Dd, Qm
+ // 4x VQMOVN.S32 Dd, Qm
+ // 2x VQADD.S16 Qd, Qm, Qn
+ // 2x VQMOVN.S16 Dd, Qm
+ // 1x VMAX.S8 Qd, Qm, Qn
+ // 1x VMIN.S8 Qd, Qm, Qn
+ // ---------------------
+ // 46 instructions total
+ //
+ // AArch64 version:
+ // 4x CMLT Vd.4S, Vn.4S, #0
+ // 4x SMULL Vd.2D, Vn.2S, Vm.2S
+ // 4x SMULL2 Vd.2D, Vn.4S, Vm.4S
+ // 4x SADDW Vd.2D, Vn.2D, Vm.2S
+ // 4x SADDW2 Vd.2D, Vn.2D, Vm.4S
+ // 8x SRSHL Vd.2D, Vn.2D, Vm.2D
+ // 4x UZP1 Vd.4S, Vn.4S, Vm.4S
+ // 2x SQXTN Vd.4H, Vn.4S
+ // 2x SQXTN2 Vd.8H, Vn.4S
+ // 2x SQADD Vd.8H, Vn.8H, Vm.8H
+ // 1x SQXTN Vd.8B, Vn.8H
+ // 1x SQXTN2 Vd.16B, Vn.8H
+ // 1x SMIN Vd.16B, Vn.16B, Vm.16B
+ // 1x SMAX Vd.16B, Vn.16B, Vm.16B
+ // ---------------------
+ // 42 instructions total
+
+ vst1q_s8(output, xyzw_clamped);
+ output += 16;
+ }
+}
diff --git a/src/qs8-requantization/precise-psimd.c b/src/qs8-requantization/precise-psimd.c
new file mode 100644
index 0000000..499e631
--- /dev/null
+++ b/src/qs8-requantization/precise-psimd.c
@@ -0,0 +1,139 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <psimd.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_precise__psimd(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{
+ assert(n % 16 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const uint32_t scale_bits = fp32_to_bits(scale);
+ const uint32_t multiplier = (scale_bits << 8) | UINT32_C(0x80000000);
+ const uint32_t shift = 127 + 31 - (scale_bits >> 23);
+ assert(shift >= 32);
+ assert(shift < 64);
+ const uint64_t rounding = UINT64_C(1) << (shift - 1);
+ // Split the 32-bit multiplier and the 64-bit rounding constant into halves: the 64-bit arithmetic below is emulated with 32-bit (and 16-bit) operations.
+ const psimd_u32 vmultiplier_lo = psimd_splat_u32(multiplier & UINT32_C(0x0000FFFF));
+ const psimd_u32 vmultiplier_hi = psimd_splat_u32(multiplier >> 16);
+ const psimd_s32 vzero_point = psimd_splat_s32((int32_t) zero_point);
+ const psimd_s32 vsmin = psimd_splat_s32((int32_t) qmin - (int32_t) zero_point);
+ const psimd_s32 vsmax = psimd_splat_s32((int32_t) qmax - (int32_t) zero_point);
+ const psimd_u32 vrounding_lo = psimd_splat_u32((uint32_t) rounding);
+ const psimd_u32 vrounding_hi = psimd_splat_u32((uint32_t) (rounding >> 32));
+ const psimd_u32 vshift = psimd_splat_u32(shift - 32);
+ for (; n != 0; n -= 16) {
+ const psimd_s32 x = psimd_load_s32(input);
+ const psimd_s32 y = psimd_load_s32(input + 4);
+ const psimd_s32 z = psimd_load_s32(input + 8);
+ const psimd_s32 w = psimd_load_s32(input + 12);
+ input += 16;
+ // All-ones (-1) mask in lanes with negative input (arithmetic shift of the sign bit).
+ const psimd_s32 x_neg_mask = x >> psimd_splat_s32(31);
+ const psimd_s32 y_neg_mask = y >> psimd_splat_s32(31);
+ const psimd_s32 z_neg_mask = z >> psimd_splat_s32(31);
+ const psimd_s32 w_neg_mask = w >> psimd_splat_s32(31);
+ // Absolute value via two's complement: (x ^ mask) - mask.
+ const psimd_u32 x_abs = (psimd_u32) ((x ^ x_neg_mask) - x_neg_mask);
+ const psimd_u32 y_abs = (psimd_u32) ((y ^ y_neg_mask) - y_neg_mask);
+ const psimd_u32 z_abs = (psimd_u32) ((z ^ z_neg_mask) - z_neg_mask);
+ const psimd_u32 w_abs = (psimd_u32) ((w ^ w_neg_mask) - w_neg_mask);
+ // Split absolute values into 16-bit halves for the long multiplication.
+ const psimd_u32 x_abs_lo = x_abs & psimd_splat_u32(UINT32_C(0x0000FFFF));
+ const psimd_u32 x_abs_hi = x_abs >> psimd_splat_u32(16);
+ const psimd_u32 y_abs_lo = y_abs & psimd_splat_u32(UINT32_C(0x0000FFFF));
+ const psimd_u32 y_abs_hi = y_abs >> psimd_splat_u32(16);
+ const psimd_u32 z_abs_lo = z_abs & psimd_splat_u32(UINT32_C(0x0000FFFF));
+ const psimd_u32 z_abs_hi = z_abs >> psimd_splat_u32(16);
+ const psimd_u32 w_abs_lo = w_abs & psimd_splat_u32(UINT32_C(0x0000FFFF));
+ const psimd_u32 w_abs_hi = w_abs >> psimd_splat_u32(16);
+ // Emulate the full 64-bit product with four 16x16 -> 32-bit partial products.
+ const psimd_u32 x_product_ll = x_abs_lo * vmultiplier_lo;
+ const psimd_u32 y_product_ll = y_abs_lo * vmultiplier_lo;
+ const psimd_u32 z_product_ll = z_abs_lo * vmultiplier_lo;
+ const psimd_u32 w_product_ll = w_abs_lo * vmultiplier_lo;
+
+ const psimd_u32 x_product_lh = x_abs_lo * vmultiplier_hi + (x_product_ll >> psimd_splat_u32(16));
+ const psimd_u32 y_product_lh = y_abs_lo * vmultiplier_hi + (y_product_ll >> psimd_splat_u32(16));
+ const psimd_u32 z_product_lh = z_abs_lo * vmultiplier_hi + (z_product_ll >> psimd_splat_u32(16));
+ const psimd_u32 w_product_lh = w_abs_lo * vmultiplier_hi + (w_product_ll >> psimd_splat_u32(16));
+
+ const psimd_u32 x_product_hl = x_abs_hi * vmultiplier_lo + (x_product_lh & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+ const psimd_u32 y_product_hl = y_abs_hi * vmultiplier_lo + (y_product_lh & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+ const psimd_u32 z_product_hl = z_abs_hi * vmultiplier_lo + (z_product_lh & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+ const psimd_u32 w_product_hl = w_abs_hi * vmultiplier_lo + (w_product_lh & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+ // Assemble the low 32-bit word of the 64-bit product.
+ const psimd_u32 x_product_lo =
+ (x_product_hl << psimd_splat_u32(16)) + (x_product_ll & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+ const psimd_u32 y_product_lo =
+ (y_product_hl << psimd_splat_u32(16)) + (y_product_ll & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+ const psimd_u32 z_product_lo =
+ (z_product_hl << psimd_splat_u32(16)) + (z_product_ll & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+ const psimd_u32 w_product_lo =
+ (w_product_hl << psimd_splat_u32(16)) + (w_product_ll & psimd_splat_u32(UINT32_C(0x0000FFFF)));
+ // Assemble the high 32-bit word, accumulating carries from the partial products.
+ const psimd_u32 x_product_hi =
+ x_abs_hi * vmultiplier_hi + (x_product_lh >> psimd_splat_u32(16)) + (x_product_hl >> psimd_splat_u32(16));
+ const psimd_u32 y_product_hi =
+ y_abs_hi * vmultiplier_hi + (y_product_lh >> psimd_splat_u32(16)) + (y_product_hl >> psimd_splat_u32(16));
+ const psimd_u32 z_product_hi =
+ z_abs_hi * vmultiplier_hi + (z_product_lh >> psimd_splat_u32(16)) + (z_product_hl >> psimd_splat_u32(16));
+ const psimd_u32 w_product_hi =
+ w_abs_hi * vmultiplier_hi + (w_product_lh >> psimd_splat_u32(16)) + (w_product_hl >> psimd_splat_u32(16));
+ // Add the 64-bit rounding constant: the low-word carry is 1 exactly when (product_lo & rounding_lo) has bit 31 set; the arithmetic shift turns that bit into -1, hence the subtraction.
+ const psimd_u32 x_adjusted_product =
+ (x_product_hi + vrounding_hi) - ((psimd_s32) (x_product_lo & vrounding_lo) >> psimd_splat_s32(31));
+ const psimd_u32 y_adjusted_product =
+ (y_product_hi + vrounding_hi) - ((psimd_s32) (y_product_lo & vrounding_lo) >> psimd_splat_s32(31));
+ const psimd_u32 z_adjusted_product =
+ (z_product_hi + vrounding_hi) - ((psimd_s32) (z_product_lo & vrounding_lo) >> psimd_splat_s32(31));
+ const psimd_u32 w_adjusted_product =
+ (w_product_hi + vrounding_hi) - ((psimd_s32) (w_product_lo & vrounding_lo) >> psimd_splat_s32(31));
+ // Finish the right shift by "shift": dropping the low word already shifted by 32; the remaining (shift - 32) is done explicitly.
+ const psimd_u32 x_abs_scaled = x_adjusted_product >> vshift;
+ const psimd_u32 y_abs_scaled = y_adjusted_product >> vshift;
+ const psimd_u32 z_abs_scaled = z_adjusted_product >> vshift;
+ const psimd_u32 w_abs_scaled = w_adjusted_product >> vshift;
+ // Restore the sign of the input on the scaled absolute value.
+ const psimd_s32 x_scaled = (psimd_s32) (x_abs_scaled ^ x_neg_mask) - x_neg_mask;
+ const psimd_s32 y_scaled = (psimd_s32) (y_abs_scaled ^ y_neg_mask) - y_neg_mask;
+ const psimd_s32 z_scaled = (psimd_s32) (z_abs_scaled ^ z_neg_mask) - z_neg_mask;
+ const psimd_s32 w_scaled = (psimd_s32) (w_abs_scaled ^ w_neg_mask) - w_neg_mask;
+ // Clamp to [qmin - zero_point, qmax - zero_point], then add the zero point back.
+ const psimd_s32 x_clamped = psimd_max_s32(psimd_min_s32(x_scaled, vsmax), vsmin) + vzero_point;
+ const psimd_s32 y_clamped = psimd_max_s32(psimd_min_s32(y_scaled, vsmax), vsmin) + vzero_point;
+ const psimd_s32 z_clamped = psimd_max_s32(psimd_min_s32(z_scaled, vsmax), vsmin) + vzero_point;
+ const psimd_s32 w_clamped = psimd_max_s32(psimd_min_s32(w_scaled, vsmax), vsmin) + vzero_point;
+ // Pack 32 -> 16 -> 8 bits by keeping the even lanes; clamped values already fit in int8.
+ const psimd_s16 xy_clamped = psimd_concat_even_s16((psimd_s16) x_clamped, (psimd_s16) y_clamped);
+ const psimd_s16 zw_clamped = psimd_concat_even_s16((psimd_s16) z_clamped, (psimd_s16) w_clamped);
+
+ const psimd_s8 xyzw_clamped = psimd_concat_even_s8((psimd_s8) xy_clamped, (psimd_s8) zw_clamped);
+
+ psimd_store_s8(output, xyzw_clamped);
+ output += 16;
+ }
+}
diff --git a/src/qs8-requantization/precise-scalar-signed64.c b/src/qs8-requantization/precise-scalar-signed64.c
new file mode 100644
index 0000000..8634266
--- /dev/null
+++ b/src/qs8-requantization/precise-scalar-signed64.c
@@ -0,0 +1,95 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/scalar-utils.h>
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_precise__scalar_signed64(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{
+ assert(n % 4 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const uint32_t scale_bits = fp32_to_bits(scale);
+ const int32_t multiplier = ((int32_t) scale_bits & INT32_C(0x007FFFFF)) | INT32_C(0x00800000);
+ const uint32_t shift = 127 + 23 - (scale_bits >> 23);
+ assert(shift >= 24);
+ assert(shift < 56);
+
+ const int64_t rounding = INT64_C(1) << (shift - 1);
+ const int32_t smin = (int32_t) qmin - (int32_t) zero_point;
+ const int32_t smax = (int32_t) qmax - (int32_t) zero_point;
+ for (; n != 0; n -= 4) {
+ const int32_t x = input[0];
+ const int32_t y = input[1];
+ const int32_t z = input[2];
+ const int32_t w = input[3];
+ input += 4;
+
+ // Compute full 64-bit product of signed 32-bit factors.
+ //
+ // Note: multiplier can be treated as either signed or unsigned.
+ const int64_t x_product = (int64_t) x * (int64_t) multiplier;
+ const int64_t y_product = (int64_t) y * (int64_t) multiplier;
+ const int64_t z_product = (int64_t) z * (int64_t) multiplier;
+ const int64_t w_product = (int64_t) w * (int64_t) multiplier;
+
+ // Adjust product before subsequent shift with rounding up to simulate shift with rounding away from zero.
+ const int64_t x_adjusted_product = x_product - (int64_t)(x < 0);
+ const int64_t y_adjusted_product = y_product - (int64_t)(y < 0);
+ const int64_t z_adjusted_product = z_product - (int64_t)(z < 0);
+ const int64_t w_adjusted_product = w_product - (int64_t)(w < 0);
+
+ // Arithmetically shift the full 64-bit product right with rounding.
+ // Rounding is performed towards closest integer, with midpoints rounded up.
+ //
+ // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit
+ // "right shift with rounding" instruction each line below can be represented by just one such instruction
+ // (e.g. VRSHL.S64 on ARM NEON, SRSHL in ARM64 Advanced SIMD).
+ const int32_t x_scaled = (int32_t) asr_s64(x_adjusted_product + rounding, shift);
+ const int32_t y_scaled = (int32_t) asr_s64(y_adjusted_product + rounding, shift);
+ const int32_t z_scaled = (int32_t) asr_s64(z_adjusted_product + rounding, shift);
+ const int32_t w_scaled = (int32_t) asr_s64(w_adjusted_product + rounding, shift);
+
+ // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
+ const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
+ const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
+ const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
+ const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
+
+ // Add zero point to clamped value.
+ // The result is guaranteed to be in [qmin, qmax] range.
+ //
+ // Note: unlike the qu8 variant (where zero point can be up to 255), the int8 zero point lies in
+ // [-128, 127], so this addition can not overflow the int32 range of the scaled values.
+ const int32_t x_biased = x_clamped + zero_point;
+ const int32_t y_biased = y_clamped + zero_point;
+ const int32_t z_biased = z_clamped + zero_point;
+ const int32_t w_biased = w_clamped + zero_point;
+
+ output[0] = (int8_t) x_biased;
+ output[1] = (int8_t) y_biased;
+ output[2] = (int8_t) z_biased;
+ output[3] = (int8_t) w_biased;
+ output += 4;
+ }
+}
diff --git a/src/qs8-requantization/precise-scalar-unsigned32.c b/src/qs8-requantization/precise-scalar-unsigned32.c
new file mode 100644
index 0000000..d02d828
--- /dev/null
+++ b/src/qs8-requantization/precise-scalar-unsigned32.c
@@ -0,0 +1,130 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/scalar-utils.h>
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_precise__scalar_unsigned32(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{
+ assert(n % 4 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const uint32_t scale_bits = fp32_to_bits(scale);
+ const uint32_t multiplier = (scale_bits << 8) | UINT32_C(0x80000000);
+ const uint32_t shift = 127 + 31 - (scale_bits >> 23);
+ assert(shift >= 32);
+ assert(shift < 64);
+
+ const uint64_t rounding = UINT64_C(1) << (shift - 1);
+ const uint32_t rounding_hi = (uint32_t)(rounding >> 32);
+ const uint32_t rounding_lo = (uint32_t) rounding;
+ const uint32_t shift_minus_32 = shift - 32;
+ const int32_t smin = (int32_t) qmin - (int32_t) zero_point;
+ const int32_t smax = (int32_t) qmax - (int32_t) zero_point;
+ for (; n != 0; n -= 4) {
+ const int32_t x = input[0];
+ const int32_t y = input[1];
+ const int32_t z = input[2];
+ const int32_t w = input[3];
+ input += 4;
+
+ // Compute absolute value of input as unsigned 32-bit int.
+ // All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
+ const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x;
+ const uint32_t y_abs = (y >= 0) ? (uint32_t) y : -(uint32_t) y;
+ const uint32_t z_abs = (z >= 0) ? (uint32_t) z : -(uint32_t) z;
+ const uint32_t w_abs = (w >= 0) ? (uint32_t) w : -(uint32_t) w;
+
+ // Compute full 64-bit product of 32-bit factors.
+ const uint64_t x_product = (uint64_t) x_abs * (uint64_t) multiplier;
+ const uint64_t y_product = (uint64_t) y_abs * (uint64_t) multiplier;
+ const uint64_t z_product = (uint64_t) z_abs * (uint64_t) multiplier;
+ const uint64_t w_product = (uint64_t) w_abs * (uint64_t) multiplier;
+
+ // Shift the full 64-bit product right with rounding.
+ // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
+ //
+ // Generally, this operation requires both 64-bit addition and 64-bit shift, but we use two tricks to replace
+ // 64-bit operations with 32-bit operations.
+ //
+ // To avoid full 64-bit addition we make use of three facts:
+ // - 64-bit rounding value added before the shift is a power of 2, and thus has only one bit set.
+ // - When 0x1.0p-1f <= scale < 1.0f, then the non-zero bit in rounding is in the low 32 bits, and
+ // rounding is exactly 0x80000000 (2**31), because rounding is 2**(shift-1) and shift == 32. In this case,
+ // addition of rounding can affect high 32 bits of the product only through overflow, which happens if
+ // low 32-bit part of the product equals or exceeds 0x80000000. We can reformulate the latter condition
+ // as low 32-bit part of the product has the bit 31 set, and then overflow happens if both the low 32-bit part
+ // of the product and the low 32-bit part of the rounding value have bit 31 set. Since 32-bit numbers with the
+ // bit 31 set are negative when interpreted as signed integers, we can check the overflow condition as
+ // (int32_t) (LOW(product) & LOW(rounding)) < 0
+ // - When 0x1.0p-32f <= scale < 0x1.0p-1f, then the non-zero bit is in the high 32 bits of rounding. We just need
+ // to do 32-bit addition of high 32 bits of rounding and high 32 bits of product. This addition never
+ // overflows because product <= 0x80000000 * 0xFFFFFF00 < 2**63 and rounding = 2**(shift-1) <= 2**62.
+ //
+ // To avoid full 64-bit shift, we leverage the fact that shift >= 32, and do it in two steps:
+ // - Shift by 32, which can be implemented by extracting the high 32-bit word on 32-bit systems.
+ // - Shift by (shift - 32), which can be implemented as a 32-bit shift of high word of addition result.
+ const uint32_t x_carry_lo = (uint32_t) ((int32_t) ((uint32_t) x_product & rounding_lo) < 0);
+ const uint32_t y_carry_lo = (uint32_t) ((int32_t) ((uint32_t) y_product & rounding_lo) < 0);
+ const uint32_t z_carry_lo = (uint32_t) ((int32_t) ((uint32_t) z_product & rounding_lo) < 0);
+ const uint32_t w_carry_lo = (uint32_t) ((int32_t) ((uint32_t) w_product & rounding_lo) < 0);
+
+ const uint32_t x_product_hi = (uint32_t) (x_product >> 32);
+ const uint32_t y_product_hi = (uint32_t) (y_product >> 32);
+ const uint32_t z_product_hi = (uint32_t) (z_product >> 32);
+ const uint32_t w_product_hi = (uint32_t) (w_product >> 32);
+
+ const uint32_t x_abs_scaled = (uint32_t) (x_product_hi + rounding_hi + x_carry_lo) >> shift_minus_32;
+ const uint32_t y_abs_scaled = (uint32_t) (y_product_hi + rounding_hi + y_carry_lo) >> shift_minus_32;
+ const uint32_t z_abs_scaled = (uint32_t) (z_product_hi + rounding_hi + z_carry_lo) >> shift_minus_32;
+ const uint32_t w_abs_scaled = (uint32_t) (w_product_hi + rounding_hi + w_carry_lo) >> shift_minus_32;
+
+ // Copy the sign of input to scaled absolute input value.
+ const int32_t x_scaled = (int32_t) (x >= 0 ? x_abs_scaled : -x_abs_scaled);
+ const int32_t y_scaled = (int32_t) (y >= 0 ? y_abs_scaled : -y_abs_scaled);
+ const int32_t z_scaled = (int32_t) (z >= 0 ? z_abs_scaled : -z_abs_scaled);
+ const int32_t w_scaled = (int32_t) (w >= 0 ? w_abs_scaled : -w_abs_scaled);
+
+ // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
+ const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
+ const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
+ const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
+ const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
+
+ // Add zero point to clamped value.
+ // The result is guaranteed to be in [qmin, qmax] range.
+ //
+ // Note: unlike the qu8 variant (where zero point can be up to 255), the int8 zero point lies in
+ // [-128, 127], so this addition can not overflow the int32 range of the scaled values.
+ const int32_t x_biased = x_clamped + zero_point;
+ const int32_t y_biased = y_clamped + zero_point;
+ const int32_t z_biased = z_clamped + zero_point;
+ const int32_t w_biased = w_clamped + zero_point;
+
+ output[0] = (int8_t) x_biased;
+ output[1] = (int8_t) y_biased;
+ output[2] = (int8_t) z_biased;
+ output[3] = (int8_t) w_biased;
+ output += 4;
+ }
+}
diff --git a/src/qs8-requantization/precise-scalar-unsigned64.c b/src/qs8-requantization/precise-scalar-unsigned64.c
new file mode 100644
index 0000000..778e95b
--- /dev/null
+++ b/src/qs8-requantization/precise-scalar-unsigned64.c
@@ -0,0 +1,102 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/scalar-utils.h>
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_precise__scalar_unsigned64(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{
+ assert(n % 4 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const uint32_t scale_bits = fp32_to_bits(scale);
+ const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000);
+ const uint32_t shift = 127 + 23 - (scale_bits >> 23);
+ assert(shift >= 24);
+ assert(shift < 56);
+
+ const uint64_t rounding = UINT64_C(1) << (shift - 1);
+ const int32_t smin = (int32_t) qmin - (int32_t) zero_point;
+ const int32_t smax = (int32_t) qmax - (int32_t) zero_point;
+ for (; n != 0; n -= 4) {
+ const int32_t x = input[0];
+ const int32_t y = input[1];
+ const int32_t z = input[2];
+ const int32_t w = input[3];
+ input += 4;
+
+ // Compute absolute value of input as unsigned 32-bit int.
+ // All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
+ const uint32_t x_abs = (x >= 0) ? (uint32_t) x : -(uint32_t) x;
+ const uint32_t y_abs = (y >= 0) ? (uint32_t) y : -(uint32_t) y;
+ const uint32_t z_abs = (z >= 0) ? (uint32_t) z : -(uint32_t) z;
+ const uint32_t w_abs = (w >= 0) ? (uint32_t) w : -(uint32_t) w;
+
+ // Compute full 64-bit product of 32-bit factors.
+ const uint64_t x_product = (uint64_t) x_abs * (uint64_t) multiplier;
+ const uint64_t y_product = (uint64_t) y_abs * (uint64_t) multiplier;
+ const uint64_t z_product = (uint64_t) z_abs * (uint64_t) multiplier;
+ const uint64_t w_product = (uint64_t) w_abs * (uint64_t) multiplier;
+
+ // Shift the full 64-bit product right with rounding.
+ // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
+ //
+ // Note that although rounding is precomputed, it is dependent on shift value, and on processors with 64-bit
+ // "right shift with rounding" instruction each line below can be represented by just one such instruction
+ // (e.g. VRSHL.U64 on ARM NEON, URSHL in ARM64 Advanced SIMD).
+ const uint32_t x_abs_scaled = (uint32_t) ((x_product + rounding) >> shift);
+ const uint32_t y_abs_scaled = (uint32_t) ((y_product + rounding) >> shift);
+ const uint32_t z_abs_scaled = (uint32_t) ((z_product + rounding) >> shift);
+ const uint32_t w_abs_scaled = (uint32_t) ((w_product + rounding) >> shift);
+
+ // Copy the sign of input to scaled absolute input value.
+ //
+ // On x86 processors with SSSE3 instruction set, this operation nicely maps to PSIGND instruction.
+ const int32_t x_scaled = (int32_t) (x >= 0 ? x_abs_scaled : -x_abs_scaled);
+ const int32_t y_scaled = (int32_t) (y >= 0 ? y_abs_scaled : -y_abs_scaled);
+ const int32_t z_scaled = (int32_t) (z >= 0 ? z_abs_scaled : -z_abs_scaled);
+ const int32_t w_scaled = (int32_t) (w >= 0 ? w_abs_scaled : -w_abs_scaled);
+
+ // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
+ const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
+ const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
+ const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
+ const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
+
+ // Add zero point to clamped value.
+ // The result is guaranteed to be in [qmin, qmax] range.
+ //
+ // Note: unlike the qu8 variant (where zero point can be up to 255), the int8 zero point lies in
+ // [-128, 127], so this addition can not overflow the int32 range of the scaled values.
+ const int32_t x_biased = x_clamped + zero_point;
+ const int32_t y_biased = y_clamped + zero_point;
+ const int32_t z_biased = z_clamped + zero_point;
+ const int32_t w_biased = w_clamped + zero_point;
+
+ output[0] = (int8_t) x_biased;
+ output[1] = (int8_t) y_biased;
+ output[2] = (int8_t) z_biased;
+ output[3] = (int8_t) w_biased;
+ output += 4;
+ }
+}
diff --git a/src/qs8-requantization/precise-sse2.c b/src/qs8-requantization/precise-sse2.c
new file mode 100644
index 0000000..192f64e
--- /dev/null
+++ b/src/qs8-requantization/precise-sse2.c
@@ -0,0 +1,131 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <emmintrin.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+// Requantizes 32-bit signed accumulators to signed 8-bit outputs using the
+// "precise" algorithm: the absolute value is multiplied by a 24-bit integer
+// multiplier in full 64-bit precision, rounded, shifted right, and the sign
+// is restored afterwards. SSE2 implementation; processes 16 values per
+// iteration (n must be a multiple of 16).
+void xnn_qs8_requantize_precise__sse2(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  // Split the binary32 scale into a 24-bit multiplier (mantissa with the
+  // implicit bit restored) and a right shift derived from the exponent.
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000);
+  const uint32_t shift = 127 + 23 - (scale_bits >> 23);
+  assert(shift >= 24);
+  assert(shift < 56);
+  const uint64_t rounding = UINT64_C(1) << (shift - 1);
+
+  const __m128i vmultiplier = _mm_set1_epi32(multiplier);
+  const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+  // Clamping below is done on 16-bit lanes (PMINSW/PMAXSW), so qmin/qmax must
+  // be splatted per 16-bit element. An 8-bit splat would place (q << 8) | q
+  // into each 16-bit lane and clamp against the wrong bounds whenever
+  // qmin/qmax are not at the extremes of the int8 range.
+  const __m128i vqmin = _mm_set1_epi16((short) qmin);
+  const __m128i vqmax = _mm_set1_epi16((short) qmax);
+  const __m128i vshift = _mm_cvtsi32_si128((int) shift);
+  const __m128i vrounding = _mm_set1_epi64x(rounding);
+  for (; n != 0; n -= 16) {
+    const __m128i x = _mm_loadu_si128((const __m128i*) input);
+    const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+    const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+    const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+    input += 16;
+
+    // SSE2 has no PABSD: compute |v| as (v XOR mask) - mask with mask = v < 0.
+    const __m128i x_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), x);
+    const __m128i y_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), y);
+    const __m128i z_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), z);
+    const __m128i w_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), w);
+
+    const __m128i x_abs0123 = _mm_sub_epi32(_mm_xor_si128(x, x_neg_mask), x_neg_mask)
+    const __m128i x_abs0123 = _mm_sub_epi32(_mm_xor_si128(x, x_neg_mask), x_neg_mask);
+    const __m128i y_abs0123 = _mm_sub_epi32(_mm_xor_si128(y, y_neg_mask), y_neg_mask);
+    const __m128i z_abs0123 = _mm_sub_epi32(_mm_xor_si128(z, z_neg_mask), z_neg_mask);
+    const __m128i w_abs0123 = _mm_sub_epi32(_mm_xor_si128(w, w_neg_mask), w_neg_mask);
+
+    // PMULUDQ multiplies only even 32-bit lanes, so swap lane pairs to get
+    // the odd elements into even positions for a second multiplication.
+    const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier);
+    const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier);
+    const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier);
+    const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier);
+
+    const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier);
+    const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier);
+    const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier);
+    const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier);
+
+    // Round (add 2**(shift-1)) and shift the 64-bit products right.
+    const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshift);
+    const __m128i x_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(x_absmul13, vrounding), vshift);
+    const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshift);
+    const __m128i y_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(y_absmul13, vrounding), vshift);
+    const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshift);
+    const __m128i z_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(z_absmul13, vrounding), vshift);
+    const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshift);
+    const __m128i w_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(w_absmul13, vrounding), vshift);
+
+    // Gather the low 32 bits of each 64-bit result, then restore 0-1-2-3 order.
+    const __m128i x_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(x_abs_scaled02), _mm_castsi128_ps(x_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i y_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(y_abs_scaled02), _mm_castsi128_ps(y_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i z_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(z_abs_scaled02), _mm_castsi128_ps(z_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i w_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(w_abs_scaled02), _mm_castsi128_ps(w_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+
+    const __m128i x_abs_scaled = _mm_shuffle_epi32(x_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i y_abs_scaled = _mm_shuffle_epi32(y_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i z_abs_scaled = _mm_shuffle_epi32(z_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i w_abs_scaled = _mm_shuffle_epi32(w_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+
+    // Restore the sign of the scaled absolute value (same xor/sub trick).
+    const __m128i x_scaled = _mm_sub_epi32(_mm_xor_si128(x_abs_scaled, x_neg_mask), x_neg_mask);
+    const __m128i y_scaled = _mm_sub_epi32(_mm_xor_si128(y_abs_scaled, y_neg_mask), y_neg_mask);
+    const __m128i z_scaled = _mm_sub_epi32(_mm_xor_si128(z_abs_scaled, z_neg_mask), z_neg_mask);
+    const __m128i w_scaled = _mm_sub_epi32(_mm_xor_si128(w_abs_scaled, w_neg_mask), w_neg_mask);
+
+    // Pack to 16 bits with saturation, add zero point, clamp in 16 bits
+    // (SSE2 lacks 8-bit signed min/max), then pack to 8 bits with saturation.
+    const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point);
+    const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point);
+    const __m128i xy_clamped = _mm_max_epi16(_mm_min_epi16(xy_packed, vqmax), vqmin);
+    const __m128i zw_clamped = _mm_max_epi16(_mm_min_epi16(zw_packed, vqmax), vqmin);
+    const __m128i xyzw_clamped = _mm_packs_epi16(xy_clamped, zw_clamped);
+
+    // 4x PXOR (setzero)
+    // 8x PSUBD
+    // 8x PXOR
+    // 8x PSHUFD
+    // 8x PMULUDQ
+    // 8x PSRLQ
+    // 8x PADDQ
+    // 4x SHUFPS
+    // 2x PACKSSDW
+    // 2x PADDSW
+    // 2x PMAXSW
+    // 2x PMINSW
+    // 1x PACKSSWB
+    // ---------------------
+    // 63 instructions total
+
+    _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+    output += 16;
+  }
+}
diff --git a/src/qs8-requantization/precise-sse4.c b/src/qs8-requantization/precise-sse4.c
new file mode 100644
index 0000000..2cb3a02
--- /dev/null
+++ b/src/qs8-requantization/precise-sse4.c
@@ -0,0 +1,117 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <smmintrin.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+// Requantizes 32-bit signed accumulators to signed 8-bit outputs using the
+// "precise" algorithm with a 32-bit multiplier. SSE4.1 implementation;
+// processes 16 values per iteration (n must be a multiple of 16).
+void xnn_qs8_requantize_precise__sse4(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  // Build a 32-bit multiplier: mantissa shifted to the top with the implicit
+  // bit restored, and a matching shift derived from the exponent.
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const uint32_t multiplier = (scale_bits << 8) | UINT32_C(0x80000000);
+  const uint32_t shift = 127 + 31 - (scale_bits >> 23);
+  assert(shift >= 32);
+  assert(shift < 64);
+  const uint64_t rounding = UINT64_C(1) << (shift - 1);
+
+  const __m128i vmultiplier = _mm_set1_epi32(multiplier);
+  const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+  const __m128i vqmin = _mm_set1_epi8((char) qmin);
+  const __m128i vqmax = _mm_set1_epi8((char) qmax);
+  // Two shift counts: a full 64-bit shift for even elements, and a shift of
+  // the high 32-bit halves (shift - 32, valid since shift >= 32) for odd
+  // elements, which leaves their results in the odd 32-bit lanes.
+  const __m128i vshiftlo = _mm_cvtsi32_si128((int) shift);
+  const __m128i vshifthi = _mm_cvtsi32_si128((int) shift - 32);
+  const __m128i vrounding = _mm_set1_epi64x(rounding);
+  for (; n != 0; n -= 16) {
+    const __m128i x = _mm_loadu_si128((const __m128i*) input);
+    const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+    const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+    const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+    input += 16;
+
+    const __m128i x_abs0123 = _mm_abs_epi32(x);
+    const __m128i y_abs0123 = _mm_abs_epi32(y);
+    const __m128i z_abs0123 = _mm_abs_epi32(z);
+    const __m128i w_abs0123 = _mm_abs_epi32(w);
+
+    // PMULUDQ multiplies only even 32-bit lanes; swap pairs to reach the odd ones.
+    const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier);
+    const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier);
+    const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier);
+    const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier);
+
+    const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier);
+    const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier);
+    const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier);
+    const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier);
+
+    // Even elements: 64-bit round-and-shift leaves results in even 32-bit lanes.
+    // Odd elements: 32-bit shift of the high halves leaves results in odd lanes.
+    const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshiftlo);
+    const __m128i x_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(x_absmul13, vrounding), vshifthi);
+    const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshiftlo);
+    const __m128i y_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(y_absmul13, vrounding), vshifthi);
+    const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshiftlo);
+    const __m128i z_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(z_absmul13, vrounding), vshifthi);
+    const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshiftlo);
+    const __m128i w_abs_scaled13 = _mm_srl_epi32(_mm_add_epi64(w_absmul13, vrounding), vshifthi);
+
+    // Blend mask 0xCC picks the odd 32-bit lanes from *13, even lanes from *02,
+    // reassembling elements in original 0-1-2-3 order without extra shuffles.
+    const __m128i x_abs_scaled = _mm_blend_epi16(x_abs_scaled02, x_abs_scaled13, 0xCC);
+    const __m128i y_abs_scaled = _mm_blend_epi16(y_abs_scaled02, y_abs_scaled13, 0xCC);
+    const __m128i z_abs_scaled = _mm_blend_epi16(z_abs_scaled02, z_abs_scaled13, 0xCC);
+    const __m128i w_abs_scaled = _mm_blend_epi16(w_abs_scaled02, w_abs_scaled13, 0xCC);
+
+    // Restore the sign of the input (PSIGND also zeroes lanes where input is 0,
+    // which is harmless since the scaled absolute value is 0 there too).
+    const __m128i x_scaled = _mm_sign_epi32(x_abs_scaled, x);
+    const __m128i y_scaled = _mm_sign_epi32(y_abs_scaled, y);
+    const __m128i z_scaled = _mm_sign_epi32(z_abs_scaled, z);
+    const __m128i w_scaled = _mm_sign_epi32(w_abs_scaled, w);
+
+    // Pack to 16 bits with saturation, add zero point, pack to 8 bits with
+    // saturation, then clamp in 8 bits (PMINSB/PMAXSB are SSE4.1).
+    const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point);
+    const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point);
+    const __m128i xyzw_packed = _mm_packs_epi16(xy_packed, zw_packed);
+    const __m128i xyzw_clamped = _mm_max_epi8(_mm_min_epi8(xyzw_packed, vqmax), vqmin);
+
+    // 4x PABSD
+    // 4x PSHUFD
+    // 8x PMULUDQ
+    // 4x PSRLQ
+    // 4x PSRLD
+    // 8x PADDQ
+    // 4x PBLENDW
+    // 4x PSIGND
+    // 2x PACKSSDW
+    // 2x PADDSW
+    // 1x PACKSSWB
+    // 1x PMAXSB
+    // 1x PMINSB
+    // ---------------------
+    // 47 instructions total
+
+    _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+    output += 16;
+  }
+}
diff --git a/src/qs8-requantization/precise-ssse3.c b/src/qs8-requantization/precise-ssse3.c
new file mode 100644
index 0000000..78c2c76
--- /dev/null
+++ b/src/qs8-requantization/precise-ssse3.c
@@ -0,0 +1,125 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <tmmintrin.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+// Requantizes 32-bit signed accumulators to signed 8-bit outputs using the
+// "precise" algorithm: |x| * multiplier computed in full 64-bit precision,
+// rounded, shifted, sign restored via PSIGND. SSSE3 implementation;
+// processes 16 values per iteration (n must be a multiple of 16).
+void xnn_qs8_requantize_precise__ssse3(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  // Split the binary32 scale into a 24-bit multiplier (mantissa with the
+  // implicit bit restored) and a right shift derived from the exponent.
+  const uint32_t scale_bits = fp32_to_bits(scale);
+  const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000);
+  const uint32_t shift = 127 + 23 - (scale_bits >> 23);
+  assert(shift >= 24);
+  assert(shift < 56);
+  const uint64_t rounding = UINT64_C(1) << (shift - 1);
+
+  const __m128i vmultiplier = _mm_set1_epi32(multiplier);
+  const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+  // Clamping below is done on 16-bit lanes (PMINSW/PMAXSW — SSSE3 has no
+  // signed 8-bit min/max), so qmin/qmax must be splatted per 16-bit element.
+  // An 8-bit splat would place (q << 8) | q into each 16-bit lane and clamp
+  // against the wrong bounds for interior qmin/qmax values.
+  const __m128i vqmin = _mm_set1_epi16((short) qmin);
+  const __m128i vqmax = _mm_set1_epi16((short) qmax);
+  const __m128i vshift = _mm_cvtsi32_si128((int) shift);
+  const __m128i vrounding = _mm_set1_epi64x(rounding);
+  for (; n != 0; n -= 16) {
+    const __m128i x = _mm_loadu_si128((const __m128i*) input);
+    const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+    const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+    const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+    input += 16;
+
+    const __m128i x_abs0123 = _mm_abs_epi32(x);
+    const __m128i y_abs0123 = _mm_abs_epi32(y);
+    const __m128i z_abs0123 = _mm_abs_epi32(z);
+    const __m128i w_abs0123 = _mm_abs_epi32(w);
+
+    // PMULUDQ multiplies only even 32-bit lanes; swap pairs to reach the odd ones.
+    const __m128i x_abs1032 = _mm_shuffle_epi32(x_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i y_abs1032 = _mm_shuffle_epi32(y_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i z_abs1032 = _mm_shuffle_epi32(z_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m128i w_abs1032 = _mm_shuffle_epi32(w_abs0123, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m128i x_absmul02 = _mm_mul_epu32(x_abs0123, vmultiplier);
+    const __m128i y_absmul02 = _mm_mul_epu32(y_abs0123, vmultiplier);
+    const __m128i z_absmul02 = _mm_mul_epu32(z_abs0123, vmultiplier);
+    const __m128i w_absmul02 = _mm_mul_epu32(w_abs0123, vmultiplier);
+
+    const __m128i x_absmul13 = _mm_mul_epu32(x_abs1032, vmultiplier);
+    const __m128i y_absmul13 = _mm_mul_epu32(y_abs1032, vmultiplier);
+    const __m128i z_absmul13 = _mm_mul_epu32(z_abs1032, vmultiplier);
+    const __m128i w_absmul13 = _mm_mul_epu32(w_abs1032, vmultiplier);
+
+    // Round (add 2**(shift-1)) and shift the 64-bit products right.
+    const __m128i x_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(x_absmul02, vrounding), vshift);
+    const __m128i x_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(x_absmul13, vrounding), vshift);
+    const __m128i y_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(y_absmul02, vrounding), vshift);
+    const __m128i y_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(y_absmul13, vrounding), vshift);
+    const __m128i z_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(z_absmul02, vrounding), vshift);
+    const __m128i z_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(z_absmul13, vrounding), vshift);
+    const __m128i w_abs_scaled02 = _mm_srl_epi64(_mm_add_epi64(w_absmul02, vrounding), vshift);
+    const __m128i w_abs_scaled13 = _mm_srl_epi64(_mm_add_epi64(w_absmul13, vrounding), vshift);
+
+    // Gather the low 32 bits of each 64-bit result, then restore 0-1-2-3 order.
+    const __m128i x_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(x_abs_scaled02), _mm_castsi128_ps(x_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i y_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(y_abs_scaled02), _mm_castsi128_ps(y_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i z_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(z_abs_scaled02), _mm_castsi128_ps(z_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+    const __m128i w_abs_scaled0213 = _mm_castps_si128(
+        _mm_shuffle_ps(_mm_castsi128_ps(w_abs_scaled02), _mm_castsi128_ps(w_abs_scaled13), _MM_SHUFFLE(2, 0, 2, 0)));
+
+    const __m128i x_abs_scaled = _mm_shuffle_epi32(x_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i y_abs_scaled = _mm_shuffle_epi32(y_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i z_abs_scaled = _mm_shuffle_epi32(z_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+    const __m128i w_abs_scaled = _mm_shuffle_epi32(w_abs_scaled0213, _MM_SHUFFLE(3, 1, 2, 0));
+
+    // Restore the sign of the input (PSIGND also zeroes lanes where input is 0,
+    // which is harmless since the scaled absolute value is 0 there too).
+    const __m128i x_scaled = _mm_sign_epi32(x_abs_scaled, x);
+    const __m128i y_scaled = _mm_sign_epi32(y_abs_scaled, y);
+    const __m128i z_scaled = _mm_sign_epi32(z_abs_scaled, z);
+    const __m128i w_scaled = _mm_sign_epi32(w_abs_scaled, w);
+
+    // Pack to 16 bits with saturation, add zero point, clamp in 16 bits,
+    // then pack to 8 bits with saturation.
+    const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point);
+    const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point);
+    const __m128i xy_clamped = _mm_max_epi16(_mm_min_epi16(xy_packed, vqmax), vqmin);
+    const __m128i zw_clamped = _mm_max_epi16(_mm_min_epi16(zw_packed, vqmax), vqmin);
+    const __m128i xyzw_clamped = _mm_packs_epi16(xy_clamped, zw_clamped);
+
+    // 4x PABSD
+    // 8x PSHUFD
+    // 8x PMULUDQ
+    // 8x PSRLQ
+    // 8x PADDQ
+    // 4x SHUFPS
+    // 4x PSIGND
+    // 2x PACKSSDW
+    // 2x PADDSW
+    // 2x PMAXSW
+    // 2x PMINSW
+    // 1x PACKSSWB
+    // ---------------------
+    // 53 instructions total
+
+    _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+    output += 16;
+  }
+}
diff --git a/src/qs8-requantization/q31-neon.c b/src/qs8-requantization/q31-neon.c
new file mode 100644
index 0000000..9756293
--- /dev/null
+++ b/src/qs8-requantization/q31-neon.c
@@ -0,0 +1,125 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <arm_neon.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+// Requantizes 32-bit signed accumulators to signed 8-bit outputs using Q31
+// fixed-point multiplication (VQRDMULH) followed by a rounding right shift.
+// NEON implementation; processes 16 values per iteration (n must be a
+// multiple of 16).
+void xnn_qs8_requantize_q31__neon(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 16 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  // Compute requantization parameters.
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
+  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+  assert(multiplier >= INT32_C(0x40000000));
+  assert(multiplier <= INT32_C(0x7FFFFF80));
+
+  // Shift is in [0, 31] range.
+  const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+  assert(shift >= 0);
+  assert(shift < 32);
+
+  const int32x4_t vmultiplier = vdupq_n_s32(multiplier);
+  const int16x8_t vzero_point = vdupq_n_s16((int16_t) zero_point);
+  // VRSHL with a negative count performs a rounding right shift.
+  const int32x4_t vshift = vdupq_n_s32(-shift);
+  // All-ones mask where shift == 0: used to suppress the rounding adjustment
+  // below in that case (a zero shift needs no midpoint correction).
+  const int32x4_t vshift_eq_0_mask = vreinterpretq_s32_u32(vceqq_s32(vshift, vmovq_n_s32(0)));
+  const int8x16_t vqmin = vdupq_n_s8(qmin);
+  const int8x16_t vqmax = vdupq_n_s8(qmax);
+  for (; n != 0; n -= 16) {
+    const int32x4_t x = vld1q_s32(input);
+    const int32x4_t y = vld1q_s32(input + 4);
+    const int32x4_t z = vld1q_s32(input + 8);
+    const int32x4_t w = vld1q_s32(input + 12);
+    input += 16;
+
+    // Directly use VQRDMULH/SQRDMULH instruction for Q31 multiplication with rounding.
+    // Although these instruction saturate out-of-range outputs, we never hit this case in requantization.
+    const int32x4_t x_product = vqrdmulhq_s32(x, vmultiplier);
+    const int32x4_t y_product = vqrdmulhq_s32(y, vmultiplier);
+    const int32x4_t z_product = vqrdmulhq_s32(z, vmultiplier);
+    const int32x4_t w_product = vqrdmulhq_s32(w, vmultiplier);
+
+    // Shift the 32-bit product right with rounding.
+    // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
+    //
+    // We leverage the "right shift with rounding" instruction (VRSHL.S32 on ARM NEON, SRSHL in ARM64 Advanced SIMD) to
+    // do the shift. However, as this instruction rounds midpoints up, rather than away from zero, we adjust the input
+    // by subtracting 1 from negative values, but only if shift is non-zero.
+    // vsraq_n_s32(p, v, 31) computes p + (v >> 31), i.e. subtracts 1 where the
+    // original input is negative; vbicq_s32 masks that off when shift == 0.
+    const int32x4_t x_adjusted_product = vsraq_n_s32(x_product, vbicq_s32(x, vshift_eq_0_mask), 31);
+    const int32x4_t y_adjusted_product = vsraq_n_s32(y_product, vbicq_s32(y, vshift_eq_0_mask), 31);
+    const int32x4_t z_adjusted_product = vsraq_n_s32(z_product, vbicq_s32(z, vshift_eq_0_mask), 31);
+    const int32x4_t w_adjusted_product = vsraq_n_s32(w_product, vbicq_s32(w, vshift_eq_0_mask), 31);
+
+    const int32x4_t x_scaled = vrshlq_s32(x_adjusted_product, vshift);
+    const int32x4_t y_scaled = vrshlq_s32(y_adjusted_product, vshift);
+    const int32x4_t z_scaled = vrshlq_s32(z_adjusted_product, vshift);
+    const int32x4_t w_scaled = vrshlq_s32(w_adjusted_product, vshift);
+
+    // Narrow to 16 bits with saturation, add zero point (saturating), then
+    // narrow to 8 bits with saturation; AArch64 has fused narrow-high forms.
+#ifdef __aarch64__
+    const int16x8_t xy_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(x_scaled), y_scaled), vzero_point);
+    const int16x8_t zw_packed = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(z_scaled), w_scaled), vzero_point);
+    const int8x16_t xyzw_packed = vqmovn_high_s16(vqmovn_s16(xy_packed), zw_packed);
+#else
+    const int16x8_t xy_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(x_scaled), vqmovn_s32(y_scaled)), vzero_point);
+    const int16x8_t zw_packed = vqaddq_s16(vcombine_s16(vqmovn_s32(z_scaled), vqmovn_s32(w_scaled)), vzero_point);
+    const int8x16_t xyzw_packed = vcombine_s8(vqmovn_s16(xy_packed), vqmovn_s16(zw_packed));
+#endif
+
+    const int8x16_t xyzw_clamped = vmaxq_s8(vminq_s8(xyzw_packed, vqmax), vqmin);
+
+    // AArch32 version:
+    //   4x VQRDMULH.S32 Qd, Qm, Qn
+    //   4x VBIC Qd, Qm, Qn
+    //   4x VSRA.S32 Qd, Qm, #31
+    //   4x VRSHL.S32 Qd, Qm, Qn
+    //   4x VQMOVN.S32 Dd, Qm
+    //   2x VQADD.S16 Qd, Qm, Qn
+    //   2x VQMOVN.S16 Dd, Qm
+    //   1x VMAX.S8 Qd, Qm, Qn
+    //   1x VMIN.S8 Qd, Qm, Qn
+    // ---------------------
+    // 26 instructions total
+    //
+    // AArch64 version:
+    //   4x SQRDMULH Vd.4S, Vn.4S, Vm.4S
+    //   4x BIC Vd.16B, Vn.16B, Vm.16B
+    //   4x SSRA Vd.4S, Vn.4S, #31
+    //   4x SRSHL Vd.4S, Vn.4S, Vm.4S
+    //   2x SQXTN Vd.4H, Vn.4S
+    //   2x SQXTN2 Vd.8H, Vn.4S
+    //   2x SQADD Vd.8H, Vn.8H, Vm.8H
+    //   1x SQXTN Vd.8B, Vn.8H
+    //   1x SQXTN2 Vd.16B, Vn.8H
+    //   1x SMIN Vd.16B, Vn.16B, Vm.16B
+    //   1x SMAX Vd.16B, Vn.16B, Vm.16B
+    // ---------------------
+    // 26 instructions total
+
+    vst1q_s8(output, xyzw_clamped);
+    output += 16;
+  }
+}
diff --git a/src/qs8-requantization/q31-scalar.c b/src/qs8-requantization/q31-scalar.c
new file mode 100644
index 0000000..ce843f0
--- /dev/null
+++ b/src/qs8-requantization/q31-scalar.c
@@ -0,0 +1,130 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/scalar-utils.h>
+#include <xnnpack/requantization-stubs.h>
+
+
+// Requantizes 32-bit signed accumulators to signed 8-bit outputs using Q31
+// fixed-point multiplication, matching the rounding behavior of the SIMD
+// q31 kernels. Scalar reference implementation; processes 4 values per
+// iteration (n must be a multiple of 4).
+//
+// Note: the casts and the order of operations below are deliberate — they
+// avoid signed-overflow undefined behavior; do not "simplify" them.
+void xnn_qs8_requantize_q31__scalar(
+    size_t n,
+    const int32_t* input,
+    float scale,
+    int8_t zero_point,
+    int8_t qmin,
+    int8_t qmax,
+    int8_t* output)
+{
+  assert(n % 4 == 0);
+  assert(scale < 1.0f);
+  assert(scale >= 0x1.0p-32f);
+
+  // Compute requantization parameters.
+  const uint32_t scale_bits = fp32_to_bits(scale);
+
+  // Multiplier is in [0x40000000, 0x7FFFFF80] range.
+  const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+  assert(multiplier >= INT32_C(0x40000000));
+  assert(multiplier <= INT32_C(0x7FFFFF80));
+
+  // Shift is in [0, 31] range.
+  const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+  assert(shift >= 0);
+  assert(shift < 32);
+
+  // Q31 rounding constant: 2**30, added before the 31-bit shift.
+  const int64_t q31rounding = INT64_C(0x40000000);
+  // Low `shift` bits of a value — the part discarded by the arithmetic shift.
+  const int32_t remainder_mask = (int32_t)((UINT32_C(1) << shift) - UINT32_C(1));
+  const int32_t threshold = (int32_t)((uint32_t) remainder_mask >> 1);
+  // Pre-biased clamping bounds (zero point is subtracted, then re-added after).
+  const int32_t smin = (int32_t) qmin - (int32_t) zero_point;
+  const int32_t smax = (int32_t) qmax - (int32_t) zero_point;
+  for (; n != 0; n -= 4) {
+    const int32_t x = input[0];
+    const int32_t y = input[1];
+    const int32_t z = input[2];
+    const int32_t w = input[3];
+    input += 4;
+
+    // Compute full 64-bit product of signed 32-bit factors.
+    //
+    // Note: multiplier can be treated as either signed or unsigned.
+    const int64_t x_product = (int64_t) x * (int64_t) multiplier;
+    const int64_t y_product = (int64_t) y * (int64_t) multiplier;
+    const int64_t z_product = (int64_t) z * (int64_t) multiplier;
+    const int64_t w_product = (int64_t) w * (int64_t) multiplier;
+
+    // Get the Q31 multiplication result by extracting bits 31-62 of the product, with rounding up.
+    // Add rounding value (0x40000000) and then shift right by 31 bits and extract the low 32-bit word.
+    // Note: casts to unsigned types are needed to avoid undefined behavior.
+    // Given the multiplier range, the result of Q31 multiplication is in [-2147483520, 2147483519] range.
+    const int32_t x_q31product = (int32_t) (uint32_t) ((uint64_t) (x_product + q31rounding) >> 31);
+    const int32_t y_q31product = (int32_t) (uint32_t) ((uint64_t) (y_product + q31rounding) >> 31);
+    const int32_t z_q31product = (int32_t) (uint32_t) ((uint64_t) (z_product + q31rounding) >> 31);
+    const int32_t w_q31product = (int32_t) (uint32_t) ((uint64_t) (w_product + q31rounding) >> 31);
+
+    // Arithmetically shift the adjusted product right with rounding.
+    // Rounding is performed towards closest integer, with midpoints rounded away from zero.
+    //
+    // Shift with correct rounding could be efficiently implemented by pre-adding rounding constant, but with input in
+    // [-2147483520, 2147483519] range and rounding constant up to 2**30 we can't rule out overflow. This limitation
+    // leaves us with 3 options:
+    // 1. Extend input to 64-bit signed integer, perform addition and shift on 64-bit integers, then truncate result
+    //    to 32 bits.
+    // 2. Detect overflow and handle this situation separately. Note that overflow is possible only when input is
+    //    positive, and even when addition of a rounding constant overflows 32-bit signed integer, it still doesn't
+    //    overflow 32-bit unsigned integer. Thus, in case of signed overflow, we can compute the result using unsigned
+    //    arithmetics, specifically using logical shift right instead of arithmetic shift right.
+    // 3. Performs arithmetic shift as is, which will produce division result rounded down. Then compute remainder of
+    //    this division by a power of 2, and adjust the result. Result needs adjustment (increment by 1) when
+    //    - input is positive, shift is non-zero, and remainder >= 2**(shift - 1), e.g. 10 >> 2 needs adjustment
+    //    - input is negative, shift is non-zero, and remainder > 2**(shift - 1), e.g. -10 >> 2 doesn't need adjustment
+    //    These conditions can be generalized as
+    //        remainder + (input <= 0) > 2**(shift - 1)
+    //    or equivalently
+    //        remainder - (input < 0) > ((2**shift - 1) >> 1)
+    //    When shift is 0, remainder is 0 as well, the last condition is always false, and no adjustment is done.
+    //
+    // Among these options, option 3 is the most performant across the board, although option 1 is promising for 64-bit
+    // instruction sets.
+    const int32_t x_remainder = (x_q31product & remainder_mask) - (int32_t) (x_q31product < 0);
+    const int32_t y_remainder = (y_q31product & remainder_mask) - (int32_t) (y_q31product < 0);
+    const int32_t z_remainder = (z_q31product & remainder_mask) - (int32_t) (z_q31product < 0);
+    const int32_t w_remainder = (w_q31product & remainder_mask) - (int32_t) (w_q31product < 0);
+
+    // asr_s32 is a portable arithmetic shift right (implementation-defined in
+    // plain C for negative values, hence the helper).
+    const int32_t x_scaled = asr_s32(x_q31product, shift) + (int32_t) (x_remainder > threshold);
+    const int32_t y_scaled = asr_s32(y_q31product, shift) + (int32_t) (y_remainder > threshold);
+    const int32_t z_scaled = asr_s32(z_q31product, shift) + (int32_t) (z_remainder > threshold);
+    const int32_t w_scaled = asr_s32(w_q31product, shift) + (int32_t) (w_remainder > threshold);
+
+    // Clamp scaled value with zero point between (qmin - zero point) and (qmax - zero point).
+    const int32_t x_clamped = x_scaled < smin ? smin : x_scaled > smax ? smax : x_scaled;
+    const int32_t y_clamped = y_scaled < smin ? smin : y_scaled > smax ? smax : y_scaled;
+    const int32_t z_clamped = z_scaled < smin ? smin : z_scaled > smax ? smax : z_scaled;
+    const int32_t w_clamped = w_scaled < smin ? smin : w_scaled > smax ? smax : w_scaled;
+
+    // Add zero point to clamped value.
+    // The result is guaranteed to be in [qmin, qmax] range.
+    //
+    // This addition can be safely done before clamping, because scaled values are in [-2147483520, 2147483519]
+    // range, so addition of zero point (which is in [-128, 127] range) can not overflow signed 32-bit integer.
+    const int32_t x_biased = x_clamped + zero_point;
+    const int32_t y_biased = y_clamped + zero_point;
+    const int32_t z_biased = z_clamped + zero_point;
+    const int32_t w_biased = w_clamped + zero_point;
+
+    output[0] = (int8_t) x_biased;
+    output[1] = (int8_t) y_biased;
+    output[2] = (int8_t) z_biased;
+    output[3] = (int8_t) w_biased;
+    output += 4;
+  }
+}
diff --git a/src/qs8-requantization/q31-sse2.c b/src/qs8-requantization/q31-sse2.c
new file mode 100644
index 0000000..c5a8500
--- /dev/null
+++ b/src/qs8-requantization/q31-sse2.c
@@ -0,0 +1,188 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <emmintrin.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_q31__sse2(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{ // Q31 requantization of int32 accumulators to int8, 16 elements per loop iteration (SSE2 only).
+ assert(n % 16 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ // Compute requantization parameters: decompose scale into a Q31 fixed-point multiplier and a right shift.
+ const uint32_t scale_bits = fp32_to_bits(scale);
+
+ // Multiplier is in [0x40000000, 0x7FFFFF80] range (float mantissa with implicit bit, shifted into Q31).
+ const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+ assert(multiplier >= INT32_C(0x40000000));
+ assert(multiplier <= INT32_C(0x7FFFFF80));
+
+ // Shift is in [0, 31] range, derived from the float exponent.
+ const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+ assert(shift >= 0);
+ assert(shift < 32);
+
+ const __m128i vmultiplier = _mm_set1_epi32(multiplier);
+ const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+ const __m128i vqmin = _mm_set1_epi16((short) qmin); // 16-bit clamp bounds: SSE2 has PMINSW/PMAXSW but no PMINSB/PMAXSB
+ const __m128i vqmax = _mm_set1_epi16((short) qmax);
+ const __m128i vshift = _mm_cvtsi32_si128((int) shift);
+ const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+ const __m128i vremainder_mask = _mm_set1_epi32((int) remainder_mask);
+ const __m128i vthreshold = _mm_set1_epi32((int) (remainder_mask >> 1));
+ const __m128i vq31rounding = _mm_set1_epi64x(UINT64_C(0x40000000)); // rounding term added before the >> 31
+ for (; n != 0; n -= 16) {
+ const __m128i x = _mm_loadu_si128((const __m128i*) input);
+ const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+ const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+ const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+ input += 16;
+
+ const __m128i x_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), x); // all-ones in lanes where input < 0
+ const __m128i y_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), y);
+ const __m128i z_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), z);
+ const __m128i w_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), w);
+
+ const __m128i x_abs = _mm_sub_epi32(_mm_xor_si128(x, x_neg_mask), x_neg_mask); // abs(v) = (v ^ mask) - mask
+ const __m128i y_abs = _mm_sub_epi32(_mm_xor_si128(y, y_neg_mask), y_neg_mask);
+ const __m128i z_abs = _mm_sub_epi32(_mm_xor_si128(z, z_neg_mask), z_neg_mask);
+ const __m128i w_abs = _mm_sub_epi32(_mm_xor_si128(w, w_neg_mask), w_neg_mask);
+
+ const __m128i x_abs_rev = _mm_shuffle_epi32(x_abs, _MM_SHUFFLE(2, 3, 0, 1)); // swap adjacent lanes so PMULUDQ can reach the odd lanes
+ const __m128i y_abs_rev = _mm_shuffle_epi32(y_abs, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i z_abs_rev = _mm_shuffle_epi32(z_abs, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i w_abs_rev = _mm_shuffle_epi32(w_abs, _MM_SHUFFLE(2, 3, 0, 1));
+
+ const __m128i x_abs_product_even = _mm_mul_epu32(x_abs, vmultiplier); // PMULUDQ: unsigned 64-bit products of even lanes
+ const __m128i y_abs_product_even = _mm_mul_epu32(y_abs, vmultiplier);
+ const __m128i z_abs_product_even = _mm_mul_epu32(z_abs, vmultiplier);
+ const __m128i w_abs_product_even = _mm_mul_epu32(w_abs, vmultiplier);
+
+ const __m128i x_neg_mask_even = _mm_shuffle_epi32(x_neg_mask, _MM_SHUFFLE(2, 2, 0, 0)); // widen even-lane sign masks to 64 bits
+ const __m128i y_neg_mask_even = _mm_shuffle_epi32(y_neg_mask, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i z_neg_mask_even = _mm_shuffle_epi32(z_neg_mask, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i w_neg_mask_even = _mm_shuffle_epi32(w_neg_mask, _MM_SHUFFLE(2, 2, 0, 0));
+
+ const __m128i x_product_even = _mm_sub_epi64(_mm_xor_si128(x_abs_product_even, x_neg_mask_even), x_neg_mask_even); // restore product sign: (p ^ mask) - mask
+ const __m128i y_product_even = _mm_sub_epi64(_mm_xor_si128(y_abs_product_even, y_neg_mask_even), y_neg_mask_even);
+ const __m128i z_product_even = _mm_sub_epi64(_mm_xor_si128(z_abs_product_even, z_neg_mask_even), z_neg_mask_even);
+ const __m128i w_product_even = _mm_sub_epi64(_mm_xor_si128(w_abs_product_even, w_neg_mask_even), w_neg_mask_even);
+
+ const __m128i x_rounded_product_even = _mm_add_epi64(x_product_even, vq31rounding);
+ const __m128i y_rounded_product_even = _mm_add_epi64(y_product_even, vq31rounding);
+ const __m128i z_rounded_product_even = _mm_add_epi64(z_product_even, vq31rounding);
+ const __m128i w_rounded_product_even = _mm_add_epi64(w_product_even, vq31rounding);
+
+ const __m128i x_abs_product_odd = _mm_mul_epu32(x_abs_rev, vmultiplier); // same pipeline for the odd lanes (via the swapped vectors)
+ const __m128i y_abs_product_odd = _mm_mul_epu32(y_abs_rev, vmultiplier);
+ const __m128i z_abs_product_odd = _mm_mul_epu32(z_abs_rev, vmultiplier);
+ const __m128i w_abs_product_odd = _mm_mul_epu32(w_abs_rev, vmultiplier);
+
+ const __m128i x_neg_mask_odd = _mm_shuffle_epi32(x_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i y_neg_mask_odd = _mm_shuffle_epi32(y_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i z_neg_mask_odd = _mm_shuffle_epi32(z_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i w_neg_mask_odd = _mm_shuffle_epi32(w_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+
+ const __m128i x_product_odd = _mm_sub_epi64(_mm_xor_si128(x_abs_product_odd, x_neg_mask_odd), x_neg_mask_odd);
+ const __m128i y_product_odd = _mm_sub_epi64(_mm_xor_si128(y_abs_product_odd, y_neg_mask_odd), y_neg_mask_odd);
+ const __m128i z_product_odd = _mm_sub_epi64(_mm_xor_si128(z_abs_product_odd, z_neg_mask_odd), z_neg_mask_odd);
+ const __m128i w_product_odd = _mm_sub_epi64(_mm_xor_si128(w_abs_product_odd, w_neg_mask_odd), w_neg_mask_odd);
+
+ const __m128i x_rounded_product_odd = _mm_add_epi64(x_product_odd, vq31rounding);
+ const __m128i y_rounded_product_odd = _mm_add_epi64(y_product_odd, vq31rounding);
+ const __m128i z_rounded_product_odd = _mm_add_epi64(z_product_odd, vq31rounding);
+ const __m128i w_rounded_product_odd = _mm_add_epi64(w_product_odd, vq31rounding);
+
+ const __m128i x_q31product_even = _mm_srli_epi64(x_rounded_product_even, 31); // rounded Q31 result lands in the low 32 bits of each 64-bit lane
+ const __m128i x_q31product_odd = _mm_srli_epi64(x_rounded_product_odd, 31);
+ const __m128i y_q31product_even = _mm_srli_epi64(y_rounded_product_even, 31);
+ const __m128i y_q31product_odd = _mm_srli_epi64(y_rounded_product_odd, 31);
+ const __m128i z_q31product_even = _mm_srli_epi64(z_rounded_product_even, 31);
+ const __m128i z_q31product_odd = _mm_srli_epi64(z_rounded_product_odd, 31);
+ const __m128i w_q31product_even = _mm_srli_epi64(w_rounded_product_even, 31);
+ const __m128i w_q31product_odd = _mm_srli_epi64(w_rounded_product_odd, 31);
+
+ const __m128i x_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps( // gather even+odd 32-bit results into one vector (order 0,2,1,3)
+ _mm_castsi128_ps(x_q31product_even), _mm_castsi128_ps(x_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+ const __m128i y_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(y_q31product_even), _mm_castsi128_ps(y_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+ const __m128i z_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(z_q31product_even), _mm_castsi128_ps(z_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+ const __m128i w_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(w_q31product_even), _mm_castsi128_ps(w_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+
+ const __m128i x_q31product = _mm_shuffle_epi32(x_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0)); // restore original lane order
+ const __m128i y_q31product = _mm_shuffle_epi32(y_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0));
+ const __m128i z_q31product = _mm_shuffle_epi32(z_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0));
+ const __m128i w_q31product = _mm_shuffle_epi32(w_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0));
+
+ const __m128i x_remainder = // shift remainder, minus 1 in negative lanes (cmpgt mask is -1): input to the rounding correction
+ _mm_add_epi32(_mm_and_si128(x_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), x_q31product));
+ const __m128i y_remainder =
+ _mm_add_epi32(_mm_and_si128(y_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), y_q31product))
+ const __m128i z_remainder =
+ _mm_add_epi32(_mm_and_si128(z_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), z_q31product));
+ const __m128i w_remainder =
+ _mm_add_epi32(_mm_and_si128(w_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), w_q31product));
+
+ const __m128i x_scaled = // arithmetic shift; subtracting the all-ones compare mask adds 1 when remainder > threshold
+ _mm_sub_epi32(_mm_sra_epi32(x_q31product, vshift), _mm_cmpgt_epi32(x_remainder, vthreshold));
+ const __m128i y_scaled =
+ _mm_sub_epi32(_mm_sra_epi32(y_q31product, vshift), _mm_cmpgt_epi32(y_remainder, vthreshold));
+ const __m128i z_scaled =
+ _mm_sub_epi32(_mm_sra_epi32(z_q31product, vshift), _mm_cmpgt_epi32(z_remainder, vthreshold));
+ const __m128i w_scaled =
+ _mm_sub_epi32(_mm_sra_epi32(w_q31product, vshift), _mm_cmpgt_epi32(w_remainder, vthreshold));
+
+ const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); // pack to int16, add zero point with saturation
+ const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point);
+ const __m128i xy_clamped = _mm_max_epi16(_mm_min_epi16(xy_packed, vqmax), vqmin); // clamp in 16 bits (no 8-bit signed min/max in SSE2)
+ const __m128i zw_clamped = _mm_max_epi16(_mm_min_epi16(zw_packed, vqmax), vqmin);
+ const __m128i xyzw_clamped = _mm_packs_epi16(xy_clamped, zw_clamped); // final saturating pack to int8
+
+ // 16x PSHUFD
+ // 4x SHUFPS
+ // 8x PMULUDQ
+ // 8x PXOR (setzero)
+ // 12x PXOR
+ // 4x PAND
+ // 8x PADDQ
+ // 4x PADDD
+ // 8x PSUBQ
+ // 8x PSUBD
+ // 8x PSRLQ (immediate)
+ // 4x PSRAD (register)
+ // 12x PCMPGTD
+ // 2x PACKSSDW
+ // 2x PADDSW
+ // 2x PMAXSW
+ // 2x PMINSW
+ // 1x PACKSSWB
+ // ---------------------
+ // 113 instructions total
+
+ _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+ output += 16;
+ }
+}
diff --git a/src/qs8-requantization/q31-sse4.c b/src/qs8-requantization/q31-sse4.c
new file mode 100644
index 0000000..11dd400
--- /dev/null
+++ b/src/qs8-requantization/q31-sse4.c
@@ -0,0 +1,136 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <smmintrin.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_q31__sse4(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{ // Q31 requantization of int32 accumulators to int8, 16 elements per loop iteration (SSE4.1: PMULDQ/PBLENDW/PMINSB/PMAXSB).
+ assert(n % 16 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ // Compute requantization parameters: decompose scale into a Q31 fixed-point multiplier and a right shift.
+ const uint32_t scale_bits = fp32_to_bits(scale);
+
+ // Multiplier is in [0x40000000, 0x7FFFFF80] range (float mantissa with implicit bit, shifted into Q31).
+ const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+ assert(multiplier >= INT32_C(0x40000000));
+ assert(multiplier <= INT32_C(0x7FFFFF80));
+
+ // Shift is in [0, 31] range, derived from the float exponent.
+ const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+ assert(shift >= 0);
+ assert(shift < 32);
+
+ const __m128i vmultiplier = _mm_set1_epi32(multiplier);
+ const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+ const __m128i vqmin = _mm_set1_epi8((char) qmin); // 8-bit clamp bounds: SSE4.1 provides PMINSB/PMAXSB
+ const __m128i vqmax = _mm_set1_epi8((char) qmax);
+ const __m128i vshift = _mm_cvtsi32_si128((int) shift);
+ const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+ const __m128i vremainder_mask = _mm_set1_epi32((int) remainder_mask);
+ const __m128i vthreshold = _mm_set1_epi32((int) (remainder_mask >> 1));
+ const __m128i vq31rounding = _mm_set1_epi64x(UINT64_C(0x40000000)); // rounding term added before the Q31 renormalization
+ for (; n != 0; n -= 16) {
+ const __m128i x = _mm_loadu_si128((const __m128i*) input);
+ const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+ const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+ const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+ input += 16;
+
+ const __m128i x_rev = _mm_shuffle_epi32(x, _MM_SHUFFLE(2, 3, 0, 1)); // swap adjacent lanes so PMULDQ can reach the odd lanes
+ const __m128i y_rev = _mm_shuffle_epi32(y, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i z_rev = _mm_shuffle_epi32(z, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i w_rev = _mm_shuffle_epi32(w, _MM_SHUFFLE(2, 3, 0, 1));
+
+ const __m128i x_product_even = _mm_add_epi64(_mm_mul_epi32(x, vmultiplier), vq31rounding); // PMULDQ: signed 64-bit products, pre-rounded
+ const __m128i y_product_even = _mm_add_epi64(_mm_mul_epi32(y, vmultiplier), vq31rounding);
+ const __m128i z_product_even = _mm_add_epi64(_mm_mul_epi32(z, vmultiplier), vq31rounding);
+ const __m128i w_product_even = _mm_add_epi64(_mm_mul_epi32(w, vmultiplier), vq31rounding);
+
+ const __m128i x_product_odd = _mm_add_epi64(_mm_mul_epi32(x_rev, vmultiplier), vq31rounding);
+ const __m128i y_product_odd = _mm_add_epi64(_mm_mul_epi32(y_rev, vmultiplier), vq31rounding);
+ const __m128i z_product_odd = _mm_add_epi64(_mm_mul_epi32(z_rev, vmultiplier), vq31rounding);
+ const __m128i w_product_odd = _mm_add_epi64(_mm_mul_epi32(w_rev, vmultiplier), vq31rounding);
+
+ const __m128i x_q31product_even = _mm_srli_epi64(x_product_even, 31); // even lanes: result in the low 32 bits after >> 31
+ const __m128i x_q31product_odd = _mm_add_epi64(x_product_odd, x_product_odd); // odd lanes: double (<< 1) so the result sits in the high 32 bits
+ const __m128i y_q31product_even = _mm_srli_epi64(y_product_even, 31);
+ const __m128i y_q31product_odd = _mm_add_epi64(y_product_odd, y_product_odd);
+ const __m128i z_q31product_even = _mm_srli_epi64(z_product_even, 31);
+ const __m128i z_q31product_odd = _mm_add_epi64(z_product_odd, z_product_odd);
+ const __m128i w_q31product_even = _mm_srli_epi64(w_product_even, 31);
+ const __m128i w_q31product_odd = _mm_add_epi64(w_product_odd, w_product_odd);
+
+ const __m128i x_q31product = _mm_blend_epi16(x_q31product_even, x_q31product_odd, 0xCC); // 0xCC picks 32-bit lanes 1 and 3 from the odd vector
+ const __m128i y_q31product = _mm_blend_epi16(y_q31product_even, y_q31product_odd, 0xCC);
+ const __m128i z_q31product = _mm_blend_epi16(z_q31product_even, z_q31product_odd, 0xCC);
+ const __m128i w_q31product = _mm_blend_epi16(w_q31product_even, w_q31product_odd, 0xCC);
+
+ const __m128i x_remainder = // shift remainder, minus 1 in negative lanes (cmpgt mask is -1): input to the rounding correction
+ _mm_add_epi32(_mm_and_si128(x_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), x_q31product));
+ const __m128i y_remainder =
+ _mm_add_epi32(_mm_and_si128(y_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), y_q31product));
+ const __m128i z_remainder =
+ _mm_add_epi32(_mm_and_si128(z_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), z_q31product));
+ const __m128i w_remainder =
+ _mm_add_epi32(_mm_and_si128(w_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), w_q31product));
+
+ const __m128i x_scaled = // arithmetic shift; subtracting the all-ones compare mask adds 1 when remainder > threshold
+ _mm_sub_epi32(_mm_sra_epi32(x_q31product, vshift), _mm_cmpgt_epi32(x_remainder, vthreshold));
+ const __m128i y_scaled =
+ _mm_sub_epi32(_mm_sra_epi32(y_q31product, vshift), _mm_cmpgt_epi32(y_remainder, vthreshold));
+ const __m128i z_scaled =
+ _mm_sub_epi32(_mm_sra_epi32(z_q31product, vshift), _mm_cmpgt_epi32(z_remainder, vthreshold));
+ const __m128i w_scaled =
+ _mm_sub_epi32(_mm_sra_epi32(w_q31product, vshift), _mm_cmpgt_epi32(w_remainder, vthreshold));
+
+ const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); // pack to int16, add zero point with saturation
+ const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point);
+ const __m128i xyzw_packed = _mm_packs_epi16(xy_packed, zw_packed);
+ const __m128i xyzw_clamped = _mm_max_epi8(_mm_min_epi8(xyzw_packed, vqmax), vqmin); // clamp directly in 8 bits (SSE4.1)
+
+ // 4x PSHUFD
+ // 8x PMULDQ
+ // 12x PADDQ
+ // 4x PADDD
+ // 4x PSUBD
+ // 4x PSRLQ (immediate)
+ // 4x PSRAD (register)
+ // 4x PBLENDW
+ // 4x PAND
+ // 4x PXOR (setzero)
+ // 8x PCMPGTD
+ // 2x PACKSSDW
+ // 2x PADDSW
+ // 1x PACKSSWB
+ // 1x PMAXSB
+ // 1x PMINSB
+ // ---------------------
+ // 67 instructions total
+
+ _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+ output += 16;
+ }
+}
diff --git a/src/qs8-requantization/q31-ssse3.c b/src/qs8-requantization/q31-ssse3.c
new file mode 100644
index 0000000..6dad4a9
--- /dev/null
+++ b/src/qs8-requantization/q31-ssse3.c
@@ -0,0 +1,189 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+#include <stdint.h>
+#include <stddef.h>
+
+#include <tmmintrin.h>
+
+#include <fp16/bitcasts.h>
+
+#include <xnnpack/requantization-stubs.h>
+
+
+void xnn_qs8_requantize_q31__ssse3(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output)
+{ // Q31 requantization of int32 accumulators to int8, 16 elements per loop iteration (SSSE3: PABSD replaces the SSE2 xor/sub abs).
+ assert(n % 16 == 0);
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ // Compute requantization parameters: decompose scale into a Q31 fixed-point multiplier and a right shift.
+ const uint32_t scale_bits = fp32_to_bits(scale);
+
+ // Multiplier is in [0x40000000, 0x7FFFFF80] range (float mantissa with implicit bit, shifted into Q31).
+ const int32_t multiplier = (int32_t)(((scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000)) << 7);
+ assert(multiplier >= INT32_C(0x40000000));
+ assert(multiplier <= INT32_C(0x7FFFFF80));
+
+ // Shift is in [0, 31] range, derived from the float exponent.
+ const int32_t shift = 127 + 31 - 32 - (fp32_to_bits(scale) >> 23);
+ assert(shift >= 0);
+ assert(shift < 32);
+
+ const __m128i vmultiplier = _mm_set1_epi32(multiplier);
+ const __m128i vzero_point = _mm_set1_epi16((short) zero_point);
+ const __m128i vqmin = _mm_set1_epi16((short) qmin); // 16-bit clamp bounds: no 8-bit signed min/max before SSE4.1
+ const __m128i vqmax = _mm_set1_epi16((short) qmax);
+ const __m128i vshift = _mm_cvtsi32_si128((int) shift);
+ const uint32_t remainder_mask = (UINT32_C(1) << shift) - UINT32_C(1);
+ const __m128i vremainder_mask = _mm_set1_epi32((int) remainder_mask);
+ const __m128i vthreshold = _mm_set1_epi32((int) (remainder_mask >> 1));
+ const __m128i vq31rounding = _mm_set1_epi64x(UINT64_C(0x40000000)); // rounding term added before the >> 31
+ for (; n != 0; n -= 16) {
+ const __m128i x = _mm_loadu_si128((const __m128i*) input);
+ const __m128i y = _mm_loadu_si128((const __m128i*) (input + 4));
+ const __m128i z = _mm_loadu_si128((const __m128i*) (input + 8));
+ const __m128i w = _mm_loadu_si128((const __m128i*) (input + 12));
+ input += 16;
+
+ const __m128i x_abs = _mm_abs_epi32(x); // SSSE3 PABSD computes |x| in one instruction
+ const __m128i y_abs = _mm_abs_epi32(y);
+ const __m128i z_abs = _mm_abs_epi32(z);
+ const __m128i w_abs = _mm_abs_epi32(w);
+
+ const __m128i x_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), x); // all-ones in lanes where input < 0
+ const __m128i y_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), y);
+ const __m128i z_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), z);
+ const __m128i w_neg_mask = _mm_cmpgt_epi32(_mm_setzero_si128(), w);
+
+ const __m128i x_abs_rev = _mm_shuffle_epi32(x_abs, _MM_SHUFFLE(2, 3, 0, 1)); // swap adjacent lanes so PMULUDQ can reach the odd lanes
+ const __m128i y_abs_rev = _mm_shuffle_epi32(y_abs, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i z_abs_rev = _mm_shuffle_epi32(z_abs, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i w_abs_rev = _mm_shuffle_epi32(w_abs, _MM_SHUFFLE(2, 3, 0, 1));
+
+ const __m128i x_abs_product_even = _mm_mul_epu32(x_abs, vmultiplier); // PMULUDQ: unsigned 64-bit products of even lanes
+ const __m128i y_abs_product_even = _mm_mul_epu32(y_abs, vmultiplier);
+ const __m128i z_abs_product_even = _mm_mul_epu32(z_abs, vmultiplier);
+ const __m128i w_abs_product_even = _mm_mul_epu32(w_abs, vmultiplier);
+
+ const __m128i x_neg_mask_even = _mm_shuffle_epi32(x_neg_mask, _MM_SHUFFLE(2, 2, 0, 0)); // widen even-lane sign masks to 64 bits
+ const __m128i y_neg_mask_even = _mm_shuffle_epi32(y_neg_mask, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i z_neg_mask_even = _mm_shuffle_epi32(z_neg_mask, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i w_neg_mask_even = _mm_shuffle_epi32(w_neg_mask, _MM_SHUFFLE(2, 2, 0, 0));
+
+ const __m128i x_product_even = _mm_sub_epi64(_mm_xor_si128(x_abs_product_even, x_neg_mask_even), x_neg_mask_even); // restore product sign: (p ^ mask) - mask
+ const __m128i y_product_even = _mm_sub_epi64(_mm_xor_si128(y_abs_product_even, y_neg_mask_even), y_neg_mask_even);
+ const __m128i z_product_even = _mm_sub_epi64(_mm_xor_si128(z_abs_product_even, z_neg_mask_even), z_neg_mask_even);
+ const __m128i w_product_even = _mm_sub_epi64(_mm_xor_si128(w_abs_product_even, w_neg_mask_even), w_neg_mask_even);
+
+ const __m128i x_rounded_product_even = _mm_add_epi64(x_product_even, vq31rounding);
+ const __m128i y_rounded_product_even = _mm_add_epi64(y_product_even, vq31rounding);
+ const __m128i z_rounded_product_even = _mm_add_epi64(z_product_even, vq31rounding);
+ const __m128i w_rounded_product_even = _mm_add_epi64(w_product_even, vq31rounding);
+
+ const __m128i x_abs_product_odd = _mm_mul_epu32(x_abs_rev, vmultiplier); // same pipeline for the odd lanes (via the swapped vectors)
+ const __m128i y_abs_product_odd = _mm_mul_epu32(y_abs_rev, vmultiplier);
+ const __m128i z_abs_product_odd = _mm_mul_epu32(z_abs_rev, vmultiplier);
+ const __m128i w_abs_product_odd = _mm_mul_epu32(w_abs_rev, vmultiplier);
+
+ const __m128i x_neg_mask_odd = _mm_shuffle_epi32(x_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i y_neg_mask_odd = _mm_shuffle_epi32(y_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i z_neg_mask_odd = _mm_shuffle_epi32(z_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+ const __m128i w_neg_mask_odd = _mm_shuffle_epi32(w_neg_mask, _MM_SHUFFLE(3, 3, 1, 1));
+
+ const __m128i x_product_odd = _mm_sub_epi64(_mm_xor_si128(x_abs_product_odd, x_neg_mask_odd), x_neg_mask_odd);
+ const __m128i y_product_odd = _mm_sub_epi64(_mm_xor_si128(y_abs_product_odd, y_neg_mask_odd), y_neg_mask_odd);
+ const __m128i z_product_odd = _mm_sub_epi64(_mm_xor_si128(z_abs_product_odd, z_neg_mask_odd), z_neg_mask_odd);
+ const __m128i w_product_odd = _mm_sub_epi64(_mm_xor_si128(w_abs_product_odd, w_neg_mask_odd), w_neg_mask_odd);
+
+ const __m128i x_rounded_product_odd = _mm_add_epi64(x_product_odd, vq31rounding);
+ const __m128i y_rounded_product_odd = _mm_add_epi64(y_product_odd, vq31rounding);
+ const __m128i z_rounded_product_odd = _mm_add_epi64(z_product_odd, vq31rounding);
+ const __m128i w_rounded_product_odd = _mm_add_epi64(w_product_odd, vq31rounding);
+
+ const __m128i x_q31product_even = _mm_srli_epi64(x_rounded_product_even, 31); // rounded Q31 result lands in the low 32 bits of each 64-bit lane
+ const __m128i x_q31product_odd = _mm_srli_epi64(x_rounded_product_odd, 31);
+ const __m128i y_q31product_even = _mm_srli_epi64(y_rounded_product_even, 31);
+ const __m128i y_q31product_odd = _mm_srli_epi64(y_rounded_product_odd, 31);
+ const __m128i z_q31product_even = _mm_srli_epi64(z_rounded_product_even, 31);
+ const __m128i z_q31product_odd = _mm_srli_epi64(z_rounded_product_odd, 31);
+ const __m128i w_q31product_even = _mm_srli_epi64(w_rounded_product_even, 31);
+ const __m128i w_q31product_odd = _mm_srli_epi64(w_rounded_product_odd, 31);
+
+ const __m128i x_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps( // gather even+odd 32-bit results into one vector (order 0,2,1,3)
+ _mm_castsi128_ps(x_q31product_even), _mm_castsi128_ps(x_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+ const __m128i y_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(y_q31product_even), _mm_castsi128_ps(y_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+ const __m128i z_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(z_q31product_even), _mm_castsi128_ps(z_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+ const __m128i w_q31product_0213 = _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(w_q31product_even), _mm_castsi128_ps(w_q31product_odd), _MM_SHUFFLE(2, 0, 2, 0)));
+
+ const __m128i x_q31product = _mm_shuffle_epi32(x_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0)); // restore original lane order
+ const __m128i y_q31product = _mm_shuffle_epi32(y_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0));
+ const __m128i z_q31product = _mm_shuffle_epi32(z_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0));
+ const __m128i w_q31product = _mm_shuffle_epi32(w_q31product_0213, _MM_SHUFFLE(3, 1, 2, 0));
+
+ const __m128i x_remainder = // shift remainder, minus 1 in negative lanes (cmpgt mask is -1): input to the rounding correction
+ _mm_add_epi32(_mm_and_si128(x_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), x_q31product));
+ const __m128i y_remainder =
+ _mm_add_epi32(_mm_and_si128(y_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), y_q31product));
+ const __m128i z_remainder =
+ _mm_add_epi32(_mm_and_si128(z_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), z_q31product));
+ const __m128i w_remainder =
+ _mm_add_epi32(_mm_and_si128(w_q31product, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), w_q31product));
+
+ const __m128i x_scaled = // arithmetic shift; subtracting the all-ones compare mask adds 1 when remainder > threshold
+ _mm_sub_epi32(_mm_sra_epi32(x_q31product, vshift), _mm_cmpgt_epi32(x_remainder, vthreshold));
+ const __m128i y_scaled =
+ _mm_sub_epi32(_mm_sra_epi32(y_q31product, vshift), _mm_cmpgt_epi32(y_remainder, vthreshold));
+ const __m128i z_scaled =
+ _mm_sub_epi32(_mm_sra_epi32(z_q31product, vshift), _mm_cmpgt_epi32(z_remainder, vthreshold));
+ const __m128i w_scaled =
+ _mm_sub_epi32(_mm_sra_epi32(w_q31product, vshift), _mm_cmpgt_epi32(w_remainder, vthreshold));
+
+ const __m128i xy_packed = _mm_adds_epi16(_mm_packs_epi32(x_scaled, y_scaled), vzero_point); // pack to int16, add zero point with saturation
+ const __m128i zw_packed = _mm_adds_epi16(_mm_packs_epi32(z_scaled, w_scaled), vzero_point);
+ const __m128i xy_clamped = _mm_max_epi16(_mm_min_epi16(xy_packed, vqmax), vqmin); // clamp in 16 bits before the final pack to int8
+ const __m128i zw_clamped = _mm_max_epi16(_mm_min_epi16(zw_packed, vqmax), vqmin);
+ const __m128i xyzw_clamped = _mm_packs_epi16(xy_clamped, zw_clamped);
+
+ // 16x PSHUFD
+ // 4x SHUFPS
+ // 8x PMULUDQ
+ // 8x PXOR (setzero)
+ // 8x PXOR
+ // 4x PAND
+ // 8x PADDQ
+ // 4x PADDD
+ // 8x PSUBQ
+ // 4x PSUBD
+ // 8x PSRLQ (immediate)
+ // 4x PSRAD (register)
+ // 12x PCMPGTD
+ // 4x PABSD
+ // 2x PACKSSDW
+ // 2x PADDSW
+ // 2x PMAXSW
+ // 2x PMINSW
+ // 1x PACKSSWB
+ // ---------------------
+ // 109 instructions total
+
+ _mm_storeu_si128((__m128i*) output, xyzw_clamped);
+ output += 16;
+ }
+}
diff --git a/src/xnnpack/requantization-stubs.h b/src/xnnpack/requantization-stubs.h
index bab892b..64f24ae 100644
--- a/src/xnnpack/requantization-stubs.h
+++ b/src/xnnpack/requantization-stubs.h
@@ -17,6 +17,7 @@
extern "C" {
#endif
+
typedef void (*xnn_qu8_requantization_function)(
size_t n,
const int32_t* input,
@@ -59,11 +60,50 @@
DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_q31__neon)
DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_q31__psimd)
-DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__scalar)
-DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__sse2)
-DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__ssse3)
-DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__sse4)
-DECLARE_QU8_REQUANTIZATION_FUNCTION(xnn_qu8_requantize_gemmlowp__neon)
+
+typedef void (*xnn_qs8_requantization_function)(
+ size_t n,
+ const int32_t* input,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax,
+ int8_t* output);
+
+#define DECLARE_QS8_REQUANTIZATION_FUNCTION(fn_name) \
+ void fn_name( \
+ size_t n, \
+ const int32_t* input, \
+ float scale, \
+ int8_t zero_point, \
+ int8_t qmin, \
+ int8_t qmax, \
+ int8_t* output);
+
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__scalar_unsigned32)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__scalar_unsigned64)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__scalar_signed64)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__sse2)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__ssse3)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__sse4)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__neon)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_precise__psimd)
+
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_fp32__scalar_lrintf)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_fp32__scalar_magic)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_fp32__sse2)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_fp32__sse4)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_fp32__neon)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_fp32__psimd)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_fp32__wasmsimd)
+
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_q31__scalar)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_q31__sse2)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_q31__ssse3)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_q31__sse4)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_q31__neon)
+DECLARE_QS8_REQUANTIZATION_FUNCTION(xnn_qs8_requantize_q31__psimd)
+
#ifdef __cplusplus
} // extern "C"
diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h
index ecfbe9a..50c6d1d 100644
--- a/src/xnnpack/requantization.h
+++ b/src/xnnpack/requantization.h
@@ -45,6 +45,102 @@
return (uint8_t) (n + params.scalar.zero_point);
}
+inline static uint8_t xnn_qu8_requantize_precise(
+ int32_t value,
+ float scale,
+ uint8_t zero_point,
+ uint8_t qmin,
+ uint8_t qmax)
+{ // Scalar reference requantization of one int32 value to uint8: exact round-half-away-from-zero scaling, then clamp and bias.
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const uint32_t scale_bits = fp32_to_bits(scale);
+ const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); // 24-bit mantissa with the implicit bit restored
+ const uint32_t shift = 127 + 23 - (scale_bits >> 23); // right shift derived from the float exponent
+ assert(shift >= 24);
+ assert(shift < 56);
+
+ // Compute absolute value of input as unsigned 32-bit int.
+ // All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
+ const uint32_t abs_value = (value >= 0) ? (uint32_t) value : -(uint32_t) value;
+
+ // Compute full 64-bit product of 32-bit factors
+ const uint64_t product = (uint64_t) abs_value * (uint64_t) multiplier;
+
+ // Shift the full 64-bit product right with rounding.
+ // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
+ const uint64_t rounding = UINT64_C(1) << (shift - 1);
+ const uint32_t abs_scaled_value = (uint32_t) ((product + rounding) >> shift);
+
+ // Copy the sign of input to scaled absolute input value.
+ const int32_t scaled_value = (int32_t) (value >= 0 ? abs_scaled_value : -abs_scaled_value);
+
+ // Clamp scaled value with zero point between smin and smax.
+ int32_t clamped_value = scaled_value;
+ const int32_t smin = (int32_t) (uint32_t) qmin - (int32_t) (uint32_t) zero_point;
+ if (clamped_value < smin) {
+ clamped_value = smin;
+ }
+ const int32_t smax = (int32_t) (uint32_t) qmax - (int32_t) (uint32_t) zero_point;
+ if (clamped_value > smax) {
+ clamped_value = smax;
+ }
+
+ // Add zero point to clamped value.
+ const int32_t biased_value = clamped_value + (int32_t) (uint32_t) zero_point; // result lies in [qmin, qmax], so the uint8_t narrowing is lossless
+
+ return biased_value;
+}
+
+inline static int8_t xnn_qs8_requantize_precise(
+ int32_t value,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax)
+{ // Scalar reference requantization of one int32 value to int8: exact round-half-away-from-zero scaling, then clamp and bias.
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
+
+ const uint32_t scale_bits = fp32_to_bits(scale);
+ const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000); // 24-bit mantissa with the implicit bit restored
+ const uint32_t shift = 127 + 23 - (scale_bits >> 23); // right shift derived from the float exponent
+ assert(shift >= 24);
+ assert(shift < 56);
+
+ // Compute absolute value of input as unsigned 32-bit int.
+ // All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
+ const uint32_t abs_value = (value >= 0) ? (uint32_t) value : -(uint32_t) value;
+
+ // Compute full 64-bit product of 32-bit factors
+ const uint64_t product = (uint64_t) abs_value * (uint64_t) multiplier;
+
+ // Shift the full 64-bit product right with rounding.
+ // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
+ const uint64_t rounding = UINT64_C(1) << (shift - 1);
+ const uint32_t abs_scaled_value = (uint32_t) ((product + rounding) >> shift);
+
+ // Copy the sign of input to scaled absolute input value.
+ const int32_t scaled_value = (int32_t) (value >= 0 ? abs_scaled_value : -abs_scaled_value);
+
+ // Clamp scaled value with zero point between smin and smax.
+ int32_t clamped_value = scaled_value;
+ const int32_t smin = (int32_t) qmin - (int32_t) zero_point;
+ if (clamped_value < smin) {
+ clamped_value = smin;
+ }
+ const int32_t smax = (int32_t) qmax - (int32_t) zero_point;
+ if (clamped_value > smax) {
+ clamped_value = smax;
+ }
+
+ // Add zero point to clamped value.
+ const int32_t biased_value = clamped_value + (int32_t) zero_point; // result lies in [qmin, qmax], so the int8_t narrowing is lossless
+
+ return biased_value;
+}
+
static inline uint8_t xnn_qu8_quantize_avgpool(
int32_t n,
union xnn_qu8_avgpool_params params)
diff --git a/src/xnnpack/scalar-utils.h b/src/xnnpack/scalar-utils.h
index dab3f82..341a4ae 100644
--- a/src/xnnpack/scalar-utils.h
+++ b/src/xnnpack/scalar-utils.h
@@ -68,51 +68,3 @@
return x >> n;
#endif
}
-
-inline static uint8_t scalar_requantize_precise(
- int32_t value,
- float scale,
- uint8_t zero_point,
- uint8_t qmin,
- uint8_t qmax)
-{
- assert(scale < 1.0f);
- assert(scale >= 0x1.0p-32f);
-
- const uint32_t scale_bits = fp32_to_bits(scale);
- const uint32_t multiplier = (scale_bits & UINT32_C(0x007FFFFF)) | UINT32_C(0x00800000);
- const uint32_t shift = 127 + 23 - (scale_bits >> 23);
- assert(shift >= 24);
- assert(shift < 56);
-
- // Compute absolute value of input as unsigned 32-bit int.
- // All further computations will work with unsigned values to avoid undefined behaviour on signed operations.
- const uint32_t abs_value = (value >= 0) ? (uint32_t) value : -(uint32_t) value;
-
- // Compute full 64-bit product of 32-bit factors
- const uint64_t product = (uint64_t) abs_value * (uint64_t) multiplier;
-
- // Shift the full 64-bit product right with rounding.
- // Rounding is performed towards closest integer, with midpoints rounded up (same as away from zero).
- const uint64_t rounding = UINT64_C(1) << (shift - 1);
- const uint32_t abs_scaled_value = (uint32_t) ((product + rounding) >> shift);
-
- // Copy the sign of input to scaled absolute input value.
- const int32_t scaled_value = (int32_t) (value >= 0 ? abs_scaled_value : -abs_scaled_value);
-
- // Clamp scaled value with zero point between smin and smax.
- int32_t clamped_value = scaled_value;
- const int32_t smin = (int32_t) (uint32_t) qmin - (int32_t) (uint32_t) zero_point;
- if (clamped_value < smin) {
- clamped_value = smin;
- }
- const int32_t smax = (int32_t) (uint32_t) qmax - (int32_t) (uint32_t) zero_point;
- if (clamped_value > smax) {
- clamped_value = smax;
- }
-
- // Add zero point to clamped value.
- const int32_t biased_value = clamped_value + (int32_t) (uint32_t) zero_point;
-
- return biased_value;
-}
diff --git a/test/qs8-requantization.cc b/test/qs8-requantization.cc
new file mode 100644
index 0000000..ed90f4e
--- /dev/null
+++ b/test/qs8-requantization.cc
@@ -0,0 +1,1192 @@
+// Copyright (c) Facebook, Inc. and its affiliates.
+// All rights reserved.
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <cmath>
+#include <cstddef>
+#include <cstdlib>
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+
+#include <xnnpack/requantization-stubs.h>
+#include "requantization-tester.h"
+
+
+/*
+ * Precise scalar implementation using unsigned 32-bit arithmetic.
+ */
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED32, exact_divide_by_po2) {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__scalar_unsigned32);
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED32, exact_divide_by_po2_with_zero_point) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__scalar_unsigned32);
+ }
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_up) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__scalar_unsigned32);
+ }
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_down) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__scalar_unsigned32);
+ }
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED32, divide_by_po2_with_rounding_away) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__scalar_unsigned32);
+ }
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED32, special_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .TestSpecialCases(xnn_qs8_requantize_precise__scalar_unsigned32);
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED32, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(100)
+ .TestRandomCasesPrecise(xnn_qs8_requantize_precise__scalar_unsigned32);
+}
+
+
+/*
+ * Precise scalar implementation using unsigned 64-bit arithmetic.
+ */
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED64, exact_divide_by_po2) {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__scalar_unsigned64);
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED64, exact_divide_by_po2_with_zero_point) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__scalar_unsigned64);
+ }
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_up) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__scalar_unsigned64);
+ }
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_down) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__scalar_unsigned64);
+ }
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED64, divide_by_po2_with_rounding_away) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__scalar_unsigned64);
+ }
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED64, special_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .TestSpecialCases(xnn_qs8_requantize_precise__scalar_unsigned64);
+}
+
+TEST(QS8_PRECISE__SCALAR_UNSIGNED64, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(100)
+ .TestRandomCasesPrecise(xnn_qs8_requantize_precise__scalar_unsigned64);
+}
+
+
+/*
+ * Precise scalar implementation using signed 64-bit arithmetic.
+ */
+
+TEST(QS8_PRECISE__SCALAR_SIGNED64, exact_divide_by_po2) {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__scalar_signed64);
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_SIGNED64, exact_divide_by_po2_with_zero_point) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__scalar_signed64);
+ }
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_SIGNED64, divide_by_po2_with_rounding_up) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__scalar_signed64);
+ }
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_SIGNED64, divide_by_po2_with_rounding_down) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__scalar_signed64);
+ }
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_SIGNED64, divide_by_po2_with_rounding_away) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__scalar_signed64);
+ }
+ }
+}
+
+TEST(QS8_PRECISE__SCALAR_SIGNED64, special_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .TestSpecialCases(xnn_qs8_requantize_precise__scalar_signed64);
+}
+
+TEST(QS8_PRECISE__SCALAR_SIGNED64, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(100)
+ .TestRandomCasesPrecise(xnn_qs8_requantize_precise__scalar_signed64);
+}
+
+
+/*
+ * FP32-based scalar implementation using lrintf function.
+ */
+
+TEST(QS8_FP32__SCALAR_LRINTF, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(1000)
+ .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__scalar_lrintf);
+}
+
+
+/*
+ * FP32-based scalar implementation using magic trick for FP32->INT32 conversion.
+ */
+
+TEST(QS8_FP32__SCALAR_MAGIC, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(1000)
+ .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__scalar_magic);
+}
+
+
+/*
+ * Q31-based scalar implementation.
+ */
+
+TEST(QS8_Q31__SCALAR, exact_divide_by_po2) {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_q31__scalar);
+ }
+}
+
+TEST(QS8_Q31__SCALAR, exact_divide_by_po2_with_zero_point) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_q31__scalar);
+ }
+ }
+}
+
+TEST(QS8_Q31__SCALAR, divide_by_po2_with_rounding_up) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_q31__scalar);
+ }
+ }
+}
+
+/* No rounding down test - it fails because of upward bias in multiplication */
+/* No rounding away test - it fails because of upward bias in multiplication */
+
+TEST(QS8_Q31__SCALAR, special_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .TestSpecialCases(xnn_qs8_requantize_q31__scalar);
+}
+
+TEST(QS8_Q31__SCALAR, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(100)
+ .TestRandomCasesApproximate(xnn_qs8_requantize_q31__scalar);
+}
+
+
+#if !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
+ /*
+ * Precise PSIMD implementation using unsigned 32-bit arithmetic.
+ */
+
+ TEST(QS8_PRECISE__PSIMD, exact_divide_by_po2) {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__psimd);
+ }
+ }
+
+ TEST(QS8_PRECISE__PSIMD, exact_divide_by_po2_with_zero_point) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__psimd);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__PSIMD, divide_by_po2_with_rounding_up) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__psimd);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__PSIMD, divide_by_po2_with_rounding_down) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__psimd);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__PSIMD, divide_by_po2_with_rounding_away) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__psimd);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__PSIMD, special_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .TestSpecialCases(xnn_qs8_requantize_precise__psimd);
+ }
+
+ TEST(QS8_PRECISE__PSIMD, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(100)
+ .TestRandomCasesPrecise(xnn_qs8_requantize_precise__psimd);
+ }
+
+
+ /*
+ * FP32-based PSIMD implementation using magic trick for FP32->INT32 conversion.
+ */
+
+ TEST(QS8_FP32__PSIMD, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(1000)
+ .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__psimd);
+ }
+#endif // !XNN_ARCH_WASM && !XNN_COMPILER_MSVC && !XNN_COMPILER_ICC
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ /*
+ * Precise SSE2 implementation using floating-point shuffle.
+ */
+
+ TEST(QS8_PRECISE__SSE2, exact_divide_by_po2) {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__sse2);
+ }
+ }
+
+ TEST(QS8_PRECISE__SSE2, exact_divide_by_po2_with_zero_point) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__sse2);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__SSE2, divide_by_po2_with_rounding_up) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__sse2);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__SSE2, divide_by_po2_with_rounding_down) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__sse2);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__SSE2, divide_by_po2_with_rounding_away) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__sse2);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__SSE2, special_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .TestSpecialCases(xnn_qs8_requantize_precise__sse2);
+ }
+
+ TEST(QS8_PRECISE__SSE2, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(100)
+ .TestRandomCasesPrecise(xnn_qs8_requantize_precise__sse2);
+ }
+
+
+ /*
+ * Precise SSSE3 implementation using floating-point shuffle.
+ */
+
+ TEST(QS8_PRECISE__SSSE3, exact_divide_by_po2) {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__ssse3);
+ }
+ }
+
+ TEST(QS8_PRECISE__SSSE3, exact_divide_by_po2_with_zero_point) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__ssse3);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__SSSE3, divide_by_po2_with_rounding_up) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__ssse3);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__SSSE3, divide_by_po2_with_rounding_down) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__ssse3);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__SSSE3, divide_by_po2_with_rounding_away) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__ssse3);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__SSSE3, special_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .TestSpecialCases(xnn_qs8_requantize_precise__ssse3);
+ }
+
+ TEST(QS8_PRECISE__SSSE3, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(100)
+ .TestRandomCasesPrecise(xnn_qs8_requantize_precise__ssse3);
+ }
+
+
+ /*
+ * Precise SSE4.1 implementation using static blend instruction.
+ */
+
+ TEST(QS8_PRECISE__SSE4, exact_divide_by_po2) {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__sse4);
+ }
+ }
+
+ TEST(QS8_PRECISE__SSE4, exact_divide_by_po2_with_zero_point) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__sse4);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__SSE4, divide_by_po2_with_rounding_up) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__sse4);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__SSE4, divide_by_po2_with_rounding_down) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__sse4);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__SSE4, divide_by_po2_with_rounding_away) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__sse4);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__SSE4, special_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .TestSpecialCases(xnn_qs8_requantize_precise__sse4);
+ }
+
+ TEST(QS8_PRECISE__SSE4, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(100)
+ .TestRandomCasesPrecise(xnn_qs8_requantize_precise__sse4);
+ }
+
+
+ /*
+ * FP32-based x86 SSE2 implementation.
+ */
+
+ TEST(QS8_FP32__SSE2, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(1000)
+ .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__sse2);
+ }
+
+
+ /*
+ * FP32-based x86 SSE4 implementation.
+ */
+
+ TEST(QS8_FP32__SSE4, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(1000)
+ .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__sse4);
+ }
+
+
+ /*
+ * Q31-based x86 SSE2 implementation.
+ */
+
+ TEST(QS8_Q31__SSE2, exact_divide_by_po2) {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_q31__sse2);
+ }
+ }
+
+ TEST(QS8_Q31__SSE2, exact_divide_by_po2_with_zero_point) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_q31__sse2);
+ }
+ }
+ }
+
+ TEST(QS8_Q31__SSE2, divide_by_po2_with_rounding_up) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_q31__sse2);
+ }
+ }
+ }
+
+ /* No rounding down test - it fails because of upward bias in multiplication */
+ /* No rounding away test - it fails because of upward bias in multiplication */
+
+ TEST(QS8_Q31__SSE2, special_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .TestSpecialCases(xnn_qs8_requantize_q31__sse2);
+ }
+
+ TEST(QS8_Q31__SSE2, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(100)
+ .TestRandomCasesApproximate(xnn_qs8_requantize_q31__sse2);
+ }
+
+
+ /*
+ * Q31-based x86 SSSE3 implementation.
+ */
+
+ TEST(QS8_Q31__SSSE3, exact_divide_by_po2) {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_q31__ssse3);
+ }
+ }
+
+ TEST(QS8_Q31__SSSE3, exact_divide_by_po2_with_zero_point) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_q31__ssse3);
+ }
+ }
+ }
+
+ TEST(QS8_Q31__SSSE3, divide_by_po2_with_rounding_up) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_q31__ssse3);
+ }
+ }
+ }
+
+ /* No rounding down test - it fails because of upward bias in multiplication */
+ /* No rounding away test - it fails because of upward bias in multiplication */
+
+ TEST(QS8_Q31__SSSE3, special_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .TestSpecialCases(xnn_qs8_requantize_q31__ssse3);
+ }
+
+ TEST(QS8_Q31__SSSE3, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(100)
+ .TestRandomCasesApproximate(xnn_qs8_requantize_q31__ssse3);
+ }
+
+
+ /*
+ * Q31-based x86 SSE4 implementation.
+ */
+
+ TEST(QS8_Q31__SSE4, exact_divide_by_po2) {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_q31__sse4);
+ }
+ }
+
+ TEST(QS8_Q31__SSE4, exact_divide_by_po2_with_zero_point) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_q31__sse4);
+ }
+ }
+ }
+
+ TEST(QS8_Q31__SSE4, divide_by_po2_with_rounding_up) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_q31__sse4);
+ }
+ }
+ }
+
+ /* No rounding down test - it fails because of upward bias in multiplication */
+ /* No rounding away test - it fails because of upward bias in multiplication */
+
+ TEST(QS8_Q31__SSE4, special_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .TestSpecialCases(xnn_qs8_requantize_q31__sse4);
+ }
+
+ TEST(QS8_Q31__SSE4, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(100)
+ .TestRandomCasesApproximate(xnn_qs8_requantize_q31__sse4);
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ /*
+ * Precise ARM NEON implementation.
+ */
+
+ TEST(QS8_PRECISE__NEON, exact_divide_by_po2) {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .s(s)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__neon);
+ }
+ }
+
+ TEST(QS8_PRECISE__NEON, exact_divide_by_po2_with_zero_point) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_precise__neon);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__NEON, divide_by_po2_with_rounding_up) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_precise__neon);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__NEON, divide_by_po2_with_rounding_down) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingDown(xnn_qs8_requantize_precise__neon);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__NEON, divide_by_po2_with_rounding_away) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingAway(xnn_qs8_requantize_precise__neon);
+ }
+ }
+ }
+
+ TEST(QS8_PRECISE__NEON, special_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .TestSpecialCases(xnn_qs8_requantize_precise__neon);
+ }
+
+ TEST(QS8_PRECISE__NEON, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(100)
+ .TestRandomCasesPrecise(xnn_qs8_requantize_precise__neon);
+ }
+
+
+ /*
+ * FP32-based ARM NEON implementation.
+ */
+
+ TEST(QS8_FP32__NEON, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(1000)
+ .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__neon);
+ }
+
+
+ /*
+ * Q31-based ARM NEON implementation.
+ */
+
+ TEST(QS8_Q31__NEON, exact_divide_by_po2) {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_q31__neon);
+ }
+ }
+
+ TEST(QS8_Q31__NEON, exact_divide_by_po2_with_zero_point) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestExactDivideByPO2(xnn_qs8_requantize_q31__neon);
+ }
+ }
+ }
+
+ TEST(QS8_Q31__NEON, divide_by_po2_with_rounding_up) {
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ for (uint32_t s = 1; s < 32; s++) {
+ RequantizationTester()
+ .zero_point(zero_point)
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .s(s)
+ .TestDivideByPO2WithRoundingUp(xnn_qs8_requantize_q31__neon);
+ }
+ }
+ }
+
+ /* No rounding down test - it fails because of upward bias in multiplication */
+ /* No rounding away test - it fails because of upward bias in multiplication */
+
+ TEST(QS8_Q31__NEON, special_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .TestSpecialCases(xnn_qs8_requantize_q31__neon);
+ }
+
+ TEST(QS8_Q31__NEON, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(100)
+ .TestRandomCasesApproximate(xnn_qs8_requantize_q31__neon);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+#if XNN_ARCH_WASMSIMD
+ /*
+ * FP32-based WAsm SIMD implementation.
+ */
+
+ TEST(QS8_FP32__WASMSIMD, random_cases) {
+ RequantizationTester()
+ .qmin(std::numeric_limits<int8_t>::min())
+ .qmax(std::numeric_limits<int8_t>::max())
+ .iterations(1000)
+ .TestRandomCasesApproximate(xnn_qs8_requantize_fp32__wasmsimd);
+ }
+#endif // XNN_ARCH_WASMSIMD
diff --git a/test/qu8-requantization.cc b/test/qu8-requantization.cc
index 53cb190..bfcf313 100644
--- a/test/qu8-requantization.cc
+++ b/test/qu8-requantization.cc
@@ -25,6 +25,8 @@
TEST(QU8_PRECISE__SCALAR_UNSIGNED32, exact_divide_by_po2) {
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__scalar_unsigned32);
}
@@ -35,6 +37,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__scalar_unsigned32);
}
@@ -46,6 +50,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__scalar_unsigned32);
}
@@ -57,6 +63,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__scalar_unsigned32);
}
@@ -68,6 +76,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__scalar_unsigned32);
}
@@ -76,11 +86,16 @@
TEST(QU8_PRECISE__SCALAR_UNSIGNED32, special_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.TestSpecialCases(xnn_qu8_requantize_precise__scalar_unsigned32);
}
TEST(QU8_PRECISE__SCALAR_UNSIGNED32, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .zero_point(128)
.iterations(100)
.TestRandomCasesPrecise(xnn_qu8_requantize_precise__scalar_unsigned32);
}
@@ -93,6 +108,8 @@
TEST(QU8_PRECISE__SCALAR_UNSIGNED64, exact_divide_by_po2) {
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__scalar_unsigned64);
}
@@ -103,6 +120,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__scalar_unsigned64);
}
@@ -114,6 +133,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__scalar_unsigned64);
}
@@ -125,6 +146,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__scalar_unsigned64);
}
@@ -136,6 +159,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__scalar_unsigned64);
}
@@ -144,11 +169,16 @@
TEST(QU8_PRECISE__SCALAR_UNSIGNED64, special_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.TestSpecialCases(xnn_qu8_requantize_precise__scalar_unsigned64);
}
TEST(QU8_PRECISE__SCALAR_UNSIGNED64, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .zero_point(128)
.iterations(100)
.TestRandomCasesPrecise(xnn_qu8_requantize_precise__scalar_unsigned64);
}
@@ -161,6 +191,8 @@
TEST(QU8_PRECISE__SCALAR_SIGNED64, exact_divide_by_po2) {
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__scalar_signed64);
}
@@ -171,6 +203,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__scalar_signed64);
}
@@ -182,6 +216,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__scalar_signed64);
}
@@ -193,6 +229,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__scalar_signed64);
}
@@ -204,6 +242,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__scalar_signed64);
}
@@ -212,11 +252,16 @@
TEST(QU8_PRECISE__SCALAR_SIGNED64, special_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.TestSpecialCases(xnn_qu8_requantize_precise__scalar_signed64);
}
TEST(QU8_PRECISE__SCALAR_SIGNED64, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .zero_point(128)
.iterations(100)
.TestRandomCasesPrecise(xnn_qu8_requantize_precise__scalar_signed64);
}
@@ -228,6 +273,8 @@
TEST(QU8_FP32__SCALAR_LRINTF, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.iterations(1000)
.TestRandomCasesApproximate(xnn_qu8_requantize_fp32__scalar_lrintf);
}
@@ -239,6 +286,8 @@
TEST(QU8_FP32__SCALAR_MAGIC, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.iterations(1000)
.TestRandomCasesApproximate(xnn_qu8_requantize_fp32__scalar_magic);
}
@@ -251,6 +300,8 @@
TEST(QU8_Q31__SCALAR, exact_divide_by_po2) {
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_q31__scalar);
}
@@ -261,6 +312,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_q31__scalar);
}
@@ -272,32 +325,28 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_q31__scalar);
}
}
}
-/* No rounding down Test - it fails because of upward bias in multiplication */
-
-TEST(QU8_Q31__SCALAR, divide_by_po2_with_rounding_away) {
- for (int32_t zero_point = 0; zero_point < 256; zero_point++) {
- for (uint32_t s = 1; s < 32; s++) {
- RequantizationTester()
- .zero_point(zero_point)
- .s(s)
- .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_q31__scalar);
- }
- }
-}
+/* No rounding down test - it fails because of upward bias in multiplication */
+/* No rounding away test - it fails because of upward bias in multiplication */
TEST(QU8_Q31__SCALAR, special_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.TestSpecialCases(xnn_qu8_requantize_q31__scalar);
}
TEST(QU8_Q31__SCALAR, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.iterations(100)
.TestRandomCasesApproximate(xnn_qu8_requantize_q31__scalar);
}
@@ -311,6 +360,8 @@
TEST(QU8_PRECISE__PSIMD, exact_divide_by_po2) {
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__psimd);
}
@@ -321,6 +372,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__psimd);
}
@@ -332,6 +385,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__psimd);
}
@@ -343,6 +398,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__psimd);
}
@@ -354,6 +411,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__psimd);
}
@@ -362,11 +421,16 @@
TEST(QU8_PRECISE__PSIMD, special_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.TestSpecialCases(xnn_qu8_requantize_precise__psimd);
}
TEST(QU8_PRECISE__PSIMD, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .zero_point(128)
.iterations(100)
.TestRandomCasesPrecise(xnn_qu8_requantize_precise__psimd);
}
@@ -378,6 +442,8 @@
TEST(QU8_FP32__PSIMD, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.iterations(1000)
.TestRandomCasesApproximate(xnn_qu8_requantize_fp32__psimd);
}
@@ -392,6 +458,8 @@
TEST(QU8_PRECISE__SSE2, exact_divide_by_po2) {
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__sse2);
}
@@ -402,6 +470,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__sse2);
}
@@ -413,6 +483,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__sse2);
}
@@ -424,6 +496,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__sse2);
}
@@ -435,6 +509,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__sse2);
}
@@ -443,11 +519,16 @@
TEST(QU8_PRECISE__SSE2, special_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.TestSpecialCases(xnn_qu8_requantize_precise__sse2);
}
TEST(QU8_PRECISE__SSE2, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .zero_point(128)
.iterations(100)
.TestRandomCasesPrecise(xnn_qu8_requantize_precise__sse2);
}
@@ -460,6 +541,8 @@
TEST(QU8_PRECISE__SSSE3, exact_divide_by_po2) {
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__ssse3);
}
@@ -470,6 +553,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__ssse3);
}
@@ -481,6 +566,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__ssse3);
}
@@ -492,6 +579,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__ssse3);
}
@@ -503,6 +592,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__ssse3);
}
@@ -511,11 +602,16 @@
TEST(QU8_PRECISE__SSSE3, special_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.TestSpecialCases(xnn_qu8_requantize_precise__ssse3);
}
TEST(QU8_PRECISE__SSSE3, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .zero_point(128)
.iterations(100)
.TestRandomCasesPrecise(xnn_qu8_requantize_precise__ssse3);
}
@@ -528,6 +624,8 @@
TEST(QU8_PRECISE__SSE4, exact_divide_by_po2) {
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__sse4);
}
@@ -538,6 +636,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__sse4);
}
@@ -549,6 +649,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__sse4);
}
@@ -560,6 +662,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__sse4);
}
@@ -571,6 +675,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__sse4);
}
@@ -579,11 +685,16 @@
TEST(QU8_PRECISE__SSE4, special_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.TestSpecialCases(xnn_qu8_requantize_precise__sse4);
}
TEST(QU8_PRECISE__SSE4, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .zero_point(128)
.iterations(100)
.TestRandomCasesPrecise(xnn_qu8_requantize_precise__sse4);
}
@@ -595,6 +706,8 @@
TEST(QU8_FP32__SSE2, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.iterations(1000)
.TestRandomCasesApproximate(xnn_qu8_requantize_fp32__sse2);
}
@@ -607,6 +720,8 @@
TEST(QU8_Q31__SSE2, exact_divide_by_po2) {
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_q31__sse2);
}
@@ -617,6 +732,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_q31__sse2);
}
@@ -628,32 +745,28 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_q31__sse2);
}
}
}
- /* No rounding down Test - it fails because of upward bias in multiplication */
-
- TEST(QU8_Q31__SSE2, divide_by_po2_with_rounding_away) {
- for (int32_t zero_point = 0; zero_point < 256; zero_point++) {
- for (uint32_t s = 1; s < 32; s++) {
- RequantizationTester()
- .zero_point(zero_point)
- .s(s)
- .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_q31__sse2);
- }
- }
- }
+ /* No rounding down test - it fails because of upward bias in multiplication */
+ /* No rounding away test - it fails because of upward bias in multiplication */
TEST(QU8_Q31__SSE2, special_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.TestSpecialCases(xnn_qu8_requantize_q31__sse2);
}
TEST(QU8_Q31__SSE2, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.iterations(100)
.TestRandomCasesApproximate(xnn_qu8_requantize_q31__sse2);
}
@@ -666,6 +779,8 @@
TEST(QU8_Q31__SSSE3, exact_divide_by_po2) {
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_q31__ssse3);
}
@@ -676,6 +791,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_q31__ssse3);
}
@@ -687,32 +804,28 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_q31__ssse3);
}
}
}
- /* No rounding down Test - it fails because of upward bias in multiplication */
-
- TEST(QU8_Q31__SSSE3, divide_by_po2_with_rounding_away) {
- for (int32_t zero_point = 0; zero_point < 256; zero_point++) {
- for (uint32_t s = 1; s < 32; s++) {
- RequantizationTester()
- .zero_point(zero_point)
- .s(s)
- .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_q31__ssse3);
- }
- }
- }
+ /* No rounding down test - it fails because of upward bias in multiplication */
+ /* No rounding away test - it fails because of upward bias in multiplication */
TEST(QU8_Q31__SSSE3, special_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.TestSpecialCases(xnn_qu8_requantize_q31__ssse3);
}
TEST(QU8_Q31__SSSE3, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.iterations(100)
.TestRandomCasesApproximate(xnn_qu8_requantize_q31__ssse3);
}
@@ -725,6 +838,8 @@
TEST(QU8_Q31__SSE4, exact_divide_by_po2) {
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_q31__sse4);
}
@@ -735,6 +850,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_q31__sse4);
}
@@ -746,32 +863,28 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_q31__sse4);
}
}
}
- /* No rounding down Test - it fails because of upward bias in multiplication */
-
- TEST(QU8_Q31__SSE4, divide_by_po2_with_rounding_away) {
- for (int32_t zero_point = 0; zero_point < 256; zero_point++) {
- for (uint32_t s = 1; s < 32; s++) {
- RequantizationTester()
- .zero_point(zero_point)
- .s(s)
- .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_q31__sse4);
- }
- }
- }
+ /* No rounding down test - it fails because of upward bias in multiplication */
+ /* No rounding away test - it fails because of upward bias in multiplication */
TEST(QU8_Q31__SSE4, special_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.TestSpecialCases(xnn_qu8_requantize_q31__sse4);
}
TEST(QU8_Q31__SSE4, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.iterations(100)
.TestRandomCasesApproximate(xnn_qu8_requantize_q31__sse4);
}
@@ -785,6 +898,8 @@
TEST(QU8_PRECISE__NEON, exact_divide_by_po2) {
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__neon);
}
@@ -795,6 +910,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_precise__neon);
}
@@ -806,6 +923,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_precise__neon);
}
@@ -817,6 +936,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingDown(xnn_qu8_requantize_precise__neon);
}
@@ -828,6 +949,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_precise__neon);
}
@@ -836,11 +959,16 @@
TEST(QU8_PRECISE__NEON, special_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.TestSpecialCases(xnn_qu8_requantize_precise__neon);
}
TEST(QU8_PRECISE__NEON, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
+ .zero_point(128)
.iterations(100)
.TestRandomCasesPrecise(xnn_qu8_requantize_precise__neon);
}
@@ -852,6 +980,8 @@
TEST(QU8_FP32__NEON, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.iterations(1000)
.TestRandomCasesApproximate(xnn_qu8_requantize_fp32__neon);
}
@@ -864,6 +994,8 @@
TEST(QU8_Q31__NEON, exact_divide_by_po2) {
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_q31__neon);
}
@@ -874,6 +1006,8 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestExactDivideByPO2(xnn_qu8_requantize_q31__neon);
}
@@ -885,32 +1019,28 @@
for (uint32_t s = 1; s < 32; s++) {
RequantizationTester()
.zero_point(zero_point)
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.s(s)
.TestDivideByPO2WithRoundingUp(xnn_qu8_requantize_q31__neon);
}
}
}
- /* No rounding down Test - it fails because of upward bias in multiplication */
-
- TEST(QU8_Q31__NEON, divide_by_po2_with_rounding_away) {
- for (int32_t zero_point = 0; zero_point < 256; zero_point++) {
- for (uint32_t s = 1; s < 32; s++) {
- RequantizationTester()
- .zero_point(zero_point)
- .s(s)
- .TestDivideByPO2WithRoundingAway(xnn_qu8_requantize_q31__neon);
- }
- }
- }
+ /* No rounding down test - it fails because of upward bias in multiplication */
+ /* No rounding away test - it fails because of upward bias in multiplication */
TEST(QU8_Q31__NEON, special_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.TestSpecialCases(xnn_qu8_requantize_q31__neon);
}
TEST(QU8_Q31__NEON, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.iterations(100)
.TestRandomCasesApproximate(xnn_qu8_requantize_q31__neon);
}
@@ -923,6 +1053,8 @@
TEST(QU8_FP32__WASMSIMD, random_cases) {
RequantizationTester()
+ .qmin(std::numeric_limits<uint8_t>::min())
+ .qmax(std::numeric_limits<uint8_t>::max())
.iterations(1000)
.TestRandomCasesApproximate(xnn_qu8_requantize_fp32__wasmsimd);
}
diff --git a/test/requantization-tester.h b/test/requantization-tester.h
index 8e34ac3..766dad7 100644
--- a/test/requantization-tester.h
+++ b/test/requantization-tester.h
@@ -22,7 +22,7 @@
#include <xnnpack/params.h>
#include <xnnpack/requantization-stubs.h>
-#include <xnnpack/scalar-utils.h>
+#include <xnnpack/requantization.h>
class RequantizationTester {
@@ -49,21 +49,21 @@
return this->zero_point_;
}
- inline RequantizationTester& qmin(uint8_t qmin) {
+ inline RequantizationTester& qmin(int16_t qmin) {
this->qmin_ = qmin;
return *this;
}
- inline uint8_t qmin() const {
+ inline int16_t qmin() const {
return this->qmin_;
}
- inline RequantizationTester& qmax(uint8_t qmax) {
+ inline RequantizationTester& qmax(int16_t qmax) {
this->qmax_ = qmax;
return *this;
}
- inline uint8_t qmax() const {
+ inline int16_t qmax() const {
return this->qmax_;
}
@@ -84,8 +84,13 @@
* produces exactly i, provided that ((i - zero point) * 2**s) does not overflow.
*/
void TestExactDivideByPO2(xnn_qu8_requantization_function requantize) const {
- ASSERT_GE(zero_point(), 0);
- ASSERT_LE(zero_point(), 255);
+ ASSERT_GE(zero_point(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(zero_point(), std::numeric_limits<uint8_t>::max());
+ ASSERT_GE(qmin(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(qmin(), std::numeric_limits<uint8_t>::max());
+ ASSERT_GE(qmax(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(qmax(), std::numeric_limits<uint8_t>::max());
+ ASSERT_LT(qmin(), qmax());
/* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
ASSERT_GE(s(), 1);
@@ -93,20 +98,61 @@
std::vector<int32_t> inputs(256);
std::vector<uint8_t> outputs(inputs.size());
- const int32_t maxI = (uint32_t(std::numeric_limits<int32_t>::max()) >> s()) + zero_point();
- const int32_t minI = -(-uint32_t(std::numeric_limits<int32_t>::min()) >> s()) + zero_point();
- for (int32_t i = 0; i < 256; i++) {
- const int32_t clampedI = std::max(minI, std::min(maxI, i));
- inputs[i] = int32_t(uint32_t(clampedI - zero_point()) << s());
+ const int32_t max_i = (uint32_t(std::numeric_limits<int32_t>::max()) >> s()) + zero_point();
+ const int32_t min_i = -(-uint32_t(std::numeric_limits<int32_t>::min()) >> s()) + zero_point();
+ for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
+ const int32_t clamped_i = std::max(min_i, std::min(max_i, i));
+ inputs[i] = int32_t(uint32_t(clamped_i - zero_point()) << s());
}
requantize(inputs.size(), inputs.data(),
scale(), zero_point(), qmin(), qmax(),
outputs.data());
- for (int32_t i = 0; i < 256; i++) {
- const int32_t clampedI = std::max(minI, std::min(maxI, i));
- ASSERT_EQ(clampedI, outputs[i]) << "i = " << i << ", clamped i = " << clampedI <<
- ", min i = " << minI << ", max i = " << maxI <<
- ", s = " << s() << ", zero point = " << zero_point();
+ for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
+ const int32_t clamped_i = std::max(min_i, std::min(max_i, i));
+ ASSERT_EQ(uint32_t(clamped_i), uint32_t(outputs[i]))
+ << "i = " << i << ", clamped i = " << clamped_i
+ << ", min i = " << min_i << ", max i = " << max_i
+ << ", s = " << s() << ", zero point = " << zero_point();
+ }
+ }
+
+ /*
+ * Test that requantization of numbers ((i - zero point) * 2**s) with
+ * - scale = exp2(-s)
+ * - zero point in [-128, 127]
+ * - no output clamping
+ * produces exactly i, provided that ((i - zero point) * 2**s) does not overflow.
+ */
+ void TestExactDivideByPO2(xnn_qs8_requantization_function requantize) const {
+ ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
+ ASSERT_GE(qmin(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(qmin(), std::numeric_limits<int8_t>::max());
+ ASSERT_GE(qmax(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(qmax(), std::numeric_limits<int8_t>::max());
+ ASSERT_LT(qmin(), qmax());
+
+ /* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
+ ASSERT_GE(s(), 1);
+ ASSERT_LT(s(), 32);
+
+ std::vector<int32_t> inputs(256);
+ std::vector<int8_t> outputs(inputs.size());
+ const int32_t max_i = (uint32_t(std::numeric_limits<int32_t>::max()) >> s()) + zero_point();
+ const int32_t min_i = -(-uint32_t(std::numeric_limits<int32_t>::min()) >> s()) + zero_point();
+ for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+ const int32_t clamped_i = std::max(min_i, std::min(max_i, i));
+ inputs[i - std::numeric_limits<int8_t>::min()] = int32_t(uint32_t(clamped_i - zero_point()) << s());
+ }
+ requantize(inputs.size(), inputs.data(),
+ scale(), zero_point(), qmin(), qmax(),
+ outputs.data());
+ for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+ const int32_t clamped_i = std::max(min_i, std::min(max_i, i));
+ ASSERT_EQ(clamped_i, int32_t(outputs[i - std::numeric_limits<int8_t>::min()]))
+ << "i = " << i << ", clamped i = " << clamped_i
+ << ", min i = " << min_i << ", max i = " << max_i
+ << ", s = " << s() << ", zero point = " << zero_point();
}
}
@@ -118,8 +164,13 @@
* produces exactly i, provided that ((i - zero point) * 2**s) does not overflow.
*/
void TestDivideByPO2WithRoundingUp(xnn_qu8_requantization_function requantize) {
- ASSERT_GE(zero_point(), 0);
- ASSERT_LE(zero_point(), 255);
+ ASSERT_GE(zero_point(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(zero_point(), std::numeric_limits<uint8_t>::max());
+ ASSERT_GE(qmin(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(qmin(), std::numeric_limits<uint8_t>::max());
+ ASSERT_GE(qmax(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(qmax(), std::numeric_limits<uint8_t>::max());
+ ASSERT_LT(qmin(), qmax());
/* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
ASSERT_GE(s(), 1);
@@ -127,7 +178,7 @@
std::vector<int32_t> inputs(256);
std::vector<uint8_t> outputs(inputs.size());
- for (int32_t i = 0; i < 256; i++) {
+ for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) -
(INT64_C(1) << (s() - 1)) + (int64_t) (i <= zero_point());
inputs[i] = int32_t(input);
@@ -135,12 +186,54 @@
requantize(inputs.size(), inputs.data(),
scale(), zero_point(), qmin(), qmax(),
outputs.data());
- for (int32_t i = 0; i < 256; i++) {
+ for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) -
(INT64_C(1) << (s() - 1)) + (int64_t) (i <= zero_point());
if (int32_t(input) == input) {
- ASSERT_EQ(i, uint32_t(outputs[i])) << "i = " << i << ", input = " << input <<
- ", s = " << s() << ", zero point = " << zero_point();
+ ASSERT_EQ(i, int32_t(outputs[i]))
+ << "i = " << i << ", input = " << input
+ << ", s = " << s() << ", zero point = " << zero_point();
+ }
+ }
+ }
+
+ /*
+ * Test that requantization of numbers (i * 2**s + sign(i - zero point) * 2**(s-1)) with
+ * - scale = exp2(-s)
+ * - zero point in [-128, 127]
+ * - no output clamping
+ * produces exactly i, provided that ((i - zero point) * 2**s) does not overflow.
+ */
+ void TestDivideByPO2WithRoundingUp(xnn_qs8_requantization_function requantize) {
+ ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
+ ASSERT_GE(qmin(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(qmin(), std::numeric_limits<int8_t>::max());
+ ASSERT_GE(qmax(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(qmax(), std::numeric_limits<int8_t>::max());
+ ASSERT_LT(qmin(), qmax());
+
+ /* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
+ ASSERT_GE(s(), 1);
+ ASSERT_LT(s(), 32);
+
+ std::vector<int32_t> inputs(256);
+ std::vector<int8_t> outputs(inputs.size());
+ for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+ const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) -
+ (INT64_C(1) << (s() - 1)) + (int64_t) (i <= zero_point());
+ inputs[i - std::numeric_limits<int8_t>::min()] = int32_t(input);
+ }
+ requantize(inputs.size(), inputs.data(),
+ scale(), zero_point(), qmin(), qmax(),
+ outputs.data());
+ for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+ const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) -
+ (INT64_C(1) << (s() - 1)) + (int64_t) (i <= zero_point());
+ if (int32_t(input) == input) {
+ ASSERT_EQ(i, int32_t(outputs[i - std::numeric_limits<int8_t>::min()]))
+ << "i = " << i << ", input = " << input
+ << ", s = " << s() << ", zero point = " << zero_point();
}
}
}
@@ -153,8 +246,13 @@
* produces exactly i, provided that ((i - zero point) * 2**s) does not overflow.
*/
void TestDivideByPO2WithRoundingDown(xnn_qu8_requantization_function requantize) {
- ASSERT_GE(zero_point(), 0);
- ASSERT_LE(zero_point(), 255);
+ ASSERT_GE(zero_point(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(zero_point(), std::numeric_limits<uint8_t>::max());
+ ASSERT_GE(qmin(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(qmin(), std::numeric_limits<uint8_t>::max());
+ ASSERT_GE(qmax(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(qmax(), std::numeric_limits<uint8_t>::max());
+ ASSERT_LT(qmin(), qmax());
/* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
ASSERT_GE(s(), 1);
@@ -162,7 +260,7 @@
std::vector<int32_t> inputs(256);
std::vector<uint8_t> outputs(inputs.size());
- for (int32_t i = 0; i < 256; i++) {
+ for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) +
(INT64_C(1) << (s() - 1)) - (int64_t) (i >= zero_point());
inputs[i] = int32_t(input);
@@ -170,19 +268,66 @@
requantize(inputs.size(), inputs.data(),
scale(), zero_point(), qmin(), qmax(),
outputs.data());
- for (int32_t i = 0; i < 256; i++) {
+ for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) +
(INT64_C(1) << (s() - 1)) - (int64_t) (i >= zero_point());
if (int32_t(input) == input) {
- ASSERT_EQ(i, uint32_t(outputs[i])) << "i = " << i << ", input = " << input <<
- ", s = " << s() << ", zero point = " << zero_point();
+ ASSERT_EQ(i, int32_t(outputs[i]))
+ << "i = " << i << ", input = " << input
+ << ", s = " << s() << ", zero point = " << zero_point();
+ }
+ }
+ }
+
+ /*
+ * Test that requantization of numbers (i * 2**s + sign(i - zero point) * 2**(s-1)) with
+ * - scale = exp2(-s)
+ * - zero point in [-128, 127]
+ * - no output clamping
+ * produces exactly i, provided that ((i - zero point) * 2**s) does not overflow.
+ */
+ void TestDivideByPO2WithRoundingDown(xnn_qs8_requantization_function requantize) {
+ ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
+ ASSERT_GE(qmin(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(qmin(), std::numeric_limits<int8_t>::max());
+ ASSERT_GE(qmax(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(qmax(), std::numeric_limits<int8_t>::max());
+ ASSERT_LT(qmin(), qmax());
+
+ /* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
+ ASSERT_GE(s(), 1);
+ ASSERT_LT(s(), 32);
+
+ std::vector<int32_t> inputs(256);
+ std::vector<int8_t> outputs(inputs.size());
+ for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+ const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) +
+ (INT64_C(1) << (s() - 1)) - (int64_t) (i >= zero_point());
+ inputs[i - std::numeric_limits<int8_t>::min()] = int32_t(input);
+ }
+ requantize(inputs.size(), inputs.data(),
+ scale(), zero_point(), qmin(), qmax(),
+ outputs.data());
+ for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+ const int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s()) +
+ (INT64_C(1) << (s() - 1)) - (int64_t) (i >= zero_point());
+ if (int32_t(input) == input) {
+ ASSERT_EQ(i, int32_t(outputs[i - std::numeric_limits<int8_t>::min()]))
+ << "i = " << i << ", input = " << input
+ << ", s = " << s() << ", zero point = " << zero_point();
}
}
}
void TestDivideByPO2WithRoundingAway(xnn_qu8_requantization_function requantize) {
- ASSERT_GE(zero_point(), 0);
- ASSERT_LE(zero_point(), 255);
+ ASSERT_GE(zero_point(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(zero_point(), std::numeric_limits<uint8_t>::max());
+ ASSERT_GE(qmin(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(qmin(), std::numeric_limits<uint8_t>::max());
+ ASSERT_GE(qmax(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(qmax(), std::numeric_limits<uint8_t>::max());
+ ASSERT_LT(qmin(), qmax());
/* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
ASSERT_GE(s(), 1);
@@ -190,7 +335,7 @@
std::vector<int32_t> inputs(256);
std::vector<uint8_t> outputs(inputs.size());
- for (int32_t i = 0; i < 256; i++) {
+ for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s());
if (input > 0) {
input -= INT64_C(1) << (s() - 1);
@@ -202,7 +347,7 @@
requantize(inputs.size(), inputs.data(),
scale(), zero_point(), qmin(), qmax(),
outputs.data());
- for (uint32_t i = 0; i < 256; i++) {
+ for (int32_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i++) {
int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s());
if (input > 0) {
input -= INT64_C(1) << (s() - 1);
@@ -210,18 +355,67 @@
input += INT64_C(1) << (s() - 1);
}
if (int32_t(input) == input) {
- ASSERT_EQ(i, uint32_t(outputs[i])) << "i = " << i << ", input = " << input <<
- ", s = " << s() << ", zero point = " << zero_point();
+ ASSERT_EQ(i, int32_t(outputs[i]))
+ << "i = " << i << ", input = " << input
+ << ", s = " << s() << ", zero point = " << zero_point();
+ }
+ }
+ }
+
+ void TestDivideByPO2WithRoundingAway(xnn_qs8_requantization_function requantize) {
+ ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
+ ASSERT_GE(qmin(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(qmin(), std::numeric_limits<int8_t>::max());
+ ASSERT_GE(qmax(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(qmax(), std::numeric_limits<int8_t>::max());
+ ASSERT_LT(qmin(), qmax());
+
+ /* Note: need s >= 1 to ensure scale = exp2(-s) < 1.0 */
+ ASSERT_GE(s(), 1);
+ ASSERT_LT(s(), 32);
+
+ std::vector<int32_t> inputs(256);
+ std::vector<int8_t> outputs(inputs.size());
+ for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+ int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s());
+ if (input > 0) {
+ input -= INT64_C(1) << (s() - 1);
+ } else if (input < 0) {
+ input += INT64_C(1) << (s() - 1);
+ }
+ inputs[i - std::numeric_limits<int8_t>::min()] = int32_t(input);
+ }
+ requantize(inputs.size(), inputs.data(),
+ scale(), zero_point(), qmin(), qmax(),
+ outputs.data());
+ for (int32_t i = std::numeric_limits<int8_t>::min(); i <= std::numeric_limits<int8_t>::max(); i++) {
+ int64_t input = RequantizationTester::ShiftLeft(i - zero_point(), s());
+ if (input > 0) {
+ input -= INT64_C(1) << (s() - 1);
+ } else if (input < 0) {
+ input += INT64_C(1) << (s() - 1);
+ }
+ if (int32_t(input) == input) {
+ ASSERT_EQ(i, int32_t(outputs[i - std::numeric_limits<int8_t>::min()]))
+ << "i = " << i << ", input = " << input
+ << ", s = " << s() << ", zero point = " << zero_point();
}
}
}
void TestSpecialCases(xnn_qu8_requantization_function requantize) {
+ ASSERT_GE(qmin(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(qmin(), std::numeric_limits<uint8_t>::max());
+ ASSERT_GE(qmax(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(qmax(), std::numeric_limits<uint8_t>::max());
+ ASSERT_LT(qmin(), qmax());
+
std::vector<int32_t> inputs(256);
std::vector<uint8_t> outputs(inputs.size());
std::fill(inputs.begin(), inputs.end(), std::numeric_limits<int32_t>::min());
- for (int32_t zero_point = 0; zero_point < 256; zero_point++) {
+ for (int32_t zero_point = 0; zero_point <= std::numeric_limits<uint8_t>::max(); zero_point++) {
requantize(
inputs.size(),
inputs.data(),
@@ -230,7 +424,9 @@
std::numeric_limits<uint8_t>::min(),
std::numeric_limits<uint8_t>::max(),
outputs.data());
- ASSERT_EQ(std::max(int32_t(0), zero_point - 1), *std::min_element(outputs.cbegin(), outputs.cend()));
+ for (size_t i = 0; i < outputs.size(); i++) {
+ ASSERT_EQ(std::max(int32_t(std::numeric_limits<uint8_t>::min()), zero_point - 1), int32_t(outputs[i]));
+ }
}
std::fill(inputs.begin(), inputs.end(), std::numeric_limits<int32_t>::max());
@@ -242,73 +438,167 @@
std::numeric_limits<uint8_t>::min(),
std::numeric_limits<uint8_t>::max(),
outputs.data());
- for (size_t i = 0; i < inputs.size(); i++) {
- ASSERT_EQ(std::numeric_limits<uint8_t>::max(), outputs[i]);
+ for (size_t i = 0; i < outputs.size(); i++) {
+ ASSERT_EQ(std::numeric_limits<uint8_t>::max(), int32_t(outputs[i]));
+ }
+ }
+
+ void TestSpecialCases(xnn_qs8_requantization_function requantize) {
+ ASSERT_GE(qmin(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(qmin(), std::numeric_limits<int8_t>::max());
+ ASSERT_GE(qmax(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(qmax(), std::numeric_limits<int8_t>::max());
+ ASSERT_LT(qmin(), qmax());
+
+ std::vector<int32_t> inputs(256);
+ std::vector<int8_t> outputs(inputs.size());
+
+ std::fill(inputs.begin(), inputs.end(), std::numeric_limits<int32_t>::min());
+ for (int32_t zero_point = std::numeric_limits<int8_t>::min();
+ zero_point <= std::numeric_limits<int8_t>::max();
+ zero_point++)
+ {
+ requantize(
+ inputs.size(),
+ inputs.data(),
+ ldexpf(1.0f, -32) /* scale */,
+ zero_point,
+ std::numeric_limits<int8_t>::min(),
+ std::numeric_limits<int8_t>::max(),
+ outputs.data());
+ for (size_t i = 0; i < outputs.size(); i++) {
+ ASSERT_EQ(std::max(int32_t(std::numeric_limits<int8_t>::min()), zero_point - 1), int32_t(outputs[i]));
+ }
+ }
+
+ std::fill(inputs.begin(), inputs.end(), std::numeric_limits<int32_t>::max());
+ requantize(
+ inputs.size(),
+ inputs.data(),
+ 0x1.FFFFFEp-1f /* scale */,
+ std::numeric_limits<int8_t>::max() /* zero point */,
+ std::numeric_limits<int8_t>::min(),
+ std::numeric_limits<int8_t>::max(),
+ outputs.data());
+ for (size_t i = 0; i < outputs.size(); i++) {
+ ASSERT_EQ(std::numeric_limits<int8_t>::max(), int32_t(outputs[i]));
}
}
void TestRandomCasesPrecise(xnn_qu8_requantization_function requantize) {
+ ASSERT_GE(zero_point(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(zero_point(), std::numeric_limits<uint8_t>::max());
+ ASSERT_GE(qmin(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(qmin(), std::numeric_limits<uint8_t>::max());
+ ASSERT_GE(qmax(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(qmax(), std::numeric_limits<uint8_t>::max());
+ ASSERT_LT(qmin(), qmax());
+
std::random_device random_device;
std::mt19937 rng(random_device());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
- auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
+ auto u8rng =
+ std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
std::vector<int32_t> inputs(4096);
std::vector<uint8_t> outputs(inputs.size());
- const uint8_t zero_point = UINT8_C(128);
std::uniform_real_distribution<float> scale_distribution(0x1.000000p-23f, 0x1.FFFFFEp-1f);
const float scale = scale_distribution(rng);
for (size_t i = 0; i < inputs.size(); i++) {
- const uint8_t approximate_output = u8rng();
+ const uint8_t approximate_output = std::min(std::max(uint8_t(u8rng()), uint8_t(qmin())), uint8_t(qmax()));
const int32_t input = int32_t(double(approximate_output) / double(scale));
inputs[i] = input;
}
requantize(
- inputs.size(), inputs.data(), scale, zero_point,
- std::numeric_limits<uint8_t>::min(),
- std::numeric_limits<uint8_t>::max(),
+ inputs.size(), inputs.data(), scale, zero_point(), qmin(), qmax(),
outputs.data());
- /* Ensure that outputs are not all identical, as in this case Test doesn't validate much */
+ /* Ensure that outputs are not all identical, as in this case the test doesn't validate much */
ASSERT_NE(
*std::max_element(outputs.cbegin(), outputs.cend()),
*std::min_element(outputs.cbegin(), outputs.cend()));
for (size_t i = 0; i < inputs.size(); i++) {
const uint8_t reference_output =
- scalar_requantize_precise(
- inputs[i], scale, zero_point,
- std::numeric_limits<uint8_t>::min(),
- std::numeric_limits<uint8_t>::max());
+ xnn_qu8_requantize_precise(inputs[i], scale, zero_point(), qmin(), qmax());
ASSERT_EQ(uint32_t(reference_output), uint32_t(outputs[i]));
}
}
}
- void TestRandomCasesApproximate(xnn_qu8_requantization_function requantize) {
+ void TestRandomCasesPrecise(xnn_qs8_requantization_function requantize) {
+ ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
+ ASSERT_GE(qmin(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(qmin(), std::numeric_limits<int8_t>::max());
+ ASSERT_GE(qmax(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(qmax(), std::numeric_limits<int8_t>::max());
+ ASSERT_LT(qmin(), qmax());
+
std::random_device random_device;
std::mt19937 rng(random_device());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
- auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
+ auto s8rng = std::bind(
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), std::ref(rng));
std::vector<int32_t> inputs(4096);
- std::vector<uint8_t> outputs(inputs.size());
+ std::vector<int8_t> outputs(inputs.size());
- const uint8_t zero_point = UINT8_C(128);
std::uniform_real_distribution<float> scale_distribution(0x1.000000p-23f, 0x1.FFFFFEp-1f);
const float scale = scale_distribution(rng);
for (size_t i = 0; i < inputs.size(); i++) {
- const uint8_t approximate_output = u8rng();
+ const int8_t approximate_output = std::min(std::max(int8_t(s8rng()), int8_t(qmin())), int8_t(qmax()));
const int32_t input = int32_t(double(approximate_output) / double(scale));
inputs[i] = input;
}
requantize(
- inputs.size(), inputs.data(), scale, zero_point,
- std::numeric_limits<uint8_t>::min(),
- std::numeric_limits<uint8_t>::max(),
+ inputs.size(), inputs.data(), scale, zero_point(), qmin(), qmax(),
+ outputs.data());
+
+ /* Ensure that outputs are not all identical, as in this case the test doesn't validate much */
+ ASSERT_NE(
+ *std::max_element(outputs.cbegin(), outputs.cend()),
+ *std::min_element(outputs.cbegin(), outputs.cend()));
+
+ for (size_t i = 0; i < inputs.size(); i++) {
+ const int8_t reference_output =
+ xnn_qs8_requantize_precise(inputs[i], scale, zero_point(), qmin(), qmax());
+ ASSERT_EQ(int32_t(reference_output), int32_t(outputs[i]));
+ }
+ }
+ }
+
+ void TestRandomCasesApproximate(xnn_qu8_requantization_function requantize) {
+ ASSERT_GE(zero_point(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(zero_point(), std::numeric_limits<uint8_t>::max());
+ ASSERT_GE(qmin(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(qmin(), std::numeric_limits<uint8_t>::max());
+ ASSERT_GE(qmax(), std::numeric_limits<uint8_t>::min());
+ ASSERT_LE(qmax(), std::numeric_limits<uint8_t>::max());
+ ASSERT_LT(qmin(), qmax());
+
+ std::random_device random_device;
+ std::mt19937 rng(random_device());
+ for (size_t iteration = 0; iteration < iterations(); iteration++) {
+ auto u8rng =
+ std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), std::ref(rng));
+
+ std::vector<int32_t> inputs(4096);
+ std::vector<uint8_t> outputs(inputs.size());
+
+ std::uniform_real_distribution<float> scale_distribution(0x1.000000p-23f, 0x1.FFFFFEp-1f);
+ const float scale = scale_distribution(rng);
+ for (size_t i = 0; i < inputs.size(); i++) {
+ const uint8_t approximate_output = std::min(std::max(uint8_t(u8rng()), uint8_t(qmin())), uint8_t(qmax()));
+ const int32_t input = int32_t(double(approximate_output) / double(scale));
+ inputs[i] = input;
+ }
+
+ requantize(
+ inputs.size(), inputs.data(), scale, zero_point(), qmin(), qmax(),
outputs.data());
/* Ensure that outputs are not all identical, as in this case Test doesn't validate much */
@@ -317,14 +607,56 @@
*std::min_element(outputs.cbegin(), outputs.cend()));
for (size_t i = 0; i < inputs.size(); i++) {
- const double reference_output =
- RequantizationTester::RequantizeApproximate(
- inputs[i], scale, zero_point,
- std::numeric_limits<uint8_t>::min(),
- std::numeric_limits<uint8_t>::max());
- ASSERT_LE(fabs(reference_output - double(outputs[i])), 0.55) <<
- "input = " << inputs[i] <<
- ", output = " << uint32_t(outputs[i]) << ", reference output = " << reference_output;
+ const double reference_output = RequantizationTester::RequantizeApproximate(
+ inputs[i], scale, uint8_t(zero_point()), uint8_t(qmin()), uint8_t(qmax()));
+ ASSERT_LE(std::abs(reference_output - double(outputs[i])), 0.55)
+ << "input = " << inputs[i] << ", output = " << int32_t(outputs[i])
+ << ", reference output = " << reference_output;
+ }
+ }
+ }
+
+ void TestRandomCasesApproximate(xnn_qs8_requantization_function requantize) {
+ ASSERT_GE(zero_point(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(zero_point(), std::numeric_limits<int8_t>::max());
+ ASSERT_GE(qmin(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(qmin(), std::numeric_limits<int8_t>::max());
+ ASSERT_GE(qmax(), std::numeric_limits<int8_t>::min());
+ ASSERT_LE(qmax(), std::numeric_limits<int8_t>::max());
+ ASSERT_LT(qmin(), qmax());
+
+ std::random_device random_device;
+ std::mt19937 rng(random_device());
+ for (size_t iteration = 0; iteration < iterations(); iteration++) {
+ auto s8rng = std::bind(
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), std::ref(rng));
+
+ std::vector<int32_t> inputs(4096);
+ std::vector<int8_t> outputs(inputs.size());
+
+ std::uniform_real_distribution<float> scale_distribution(0x1.000000p-23f, 0x1.FFFFFEp-1f);
+ const float scale = scale_distribution(rng);
+ for (size_t i = 0; i < inputs.size(); i++) {
+ const int8_t approximate_output = std::min(std::max(int8_t(s8rng()), int8_t(qmin())), int8_t(qmax()));
+ const int32_t input = int32_t(double(approximate_output) / double(scale));
+ inputs[i] = input;
+ }
+
+ requantize(
+ inputs.size(), inputs.data(), scale, zero_point(), qmin(), qmax(),
+ outputs.data());
+
+ /* Ensure that outputs are not all identical, as in this case the test doesn't validate much */
+ ASSERT_NE(
+ *std::max_element(outputs.cbegin(), outputs.cend()),
+ *std::min_element(outputs.cbegin(), outputs.cend()));
+
+ for (size_t i = 0; i < inputs.size(); i++) {
+ const double reference_output = RequantizationTester::RequantizeApproximate(
+ inputs[i], scale, int8_t(zero_point()), int8_t(qmin()), int8_t(qmax()));
+ ASSERT_LE(std::abs(reference_output - double(outputs[i])), 0.55)
+ << "input = " << inputs[i] << ", output = " << int32_t(outputs[i])
+ << ", reference output = " << reference_output;
}
}
}
@@ -343,25 +675,26 @@
assert(scale < 1.0f);
assert(scale >= 0x1.0p-32f);
- double clamped_value = double(value) * double(scale) + double(zero_point);
+ return std::min(std::max(double(value) * double(scale) + double(zero_point), double(qmin)), double(qmax));
+ }
- const double fmin = double(qmin);
- if (clamped_value < fmin) {
- clamped_value = fmin;
- }
+ static inline double RequantizeApproximate(
+ int32_t value,
+ float scale,
+ int8_t zero_point,
+ int8_t qmin,
+ int8_t qmax)
+ {
+ assert(scale < 1.0f);
+ assert(scale >= 0x1.0p-32f);
- const double fmax = double(qmax);
- if (clamped_value > fmax) {
- clamped_value = fmax;
- }
-
- return clamped_value;
+ return std::min(std::max(double(value) * double(scale) + double(zero_point), double(qmin)), double(qmax));
}
private:
- size_t zero_point_{0};
- size_t s_{1};
- uint8_t qmin_{std::numeric_limits<uint8_t>::min()};
- uint8_t qmax_{std::numeric_limits<uint8_t>::max()};
+ uint32_t s_{1};
+ int32_t zero_point_{0};
+ int16_t qmin_{std::numeric_limits<int16_t>::min()};
+ int16_t qmax_{std::numeric_limits<int16_t>::max()};
size_t iterations_{1};
};