Add QS8 ADD microkernels with SSE2 and SSE4.1 implementations
PiperOrigin-RevId: 325862938
diff --git a/BUILD.bazel b/BUILD.bazel
index f5f5a93..c207f75 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1792,6 +1792,10 @@
"src/qs8-requantization/precise-sse2.c",
"src/qs8-requantization/fp32-sse2.c",
"src/qs8-requantization/q31-sse2.c",
+ "src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c",
+ "src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x16.c",
+ "src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x24.c",
+ "src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x32.c",
"src/qu8-avgpool/9p8x-minmax-sse2-c8.c",
"src/qu8-avgpool/9x-minmax-sse2-c8.c",
"src/qu8-igemm/4x4c2-minmax-sse2.c",
@@ -1922,6 +1926,10 @@
"src/qs8-requantization/fp32-sse4.c",
"src/qs8-requantization/precise-sse4.c",
"src/qs8-requantization/q31-sse4.c",
+ "src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x8.c",
+ "src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x16.c",
+ "src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x24.c",
+ "src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x32.c",
"src/qu8-requantization/precise-sse4.c",
"src/qu8-requantization/q31-sse4.c",
"src/math/roundne-sse41.c",
@@ -5534,6 +5542,15 @@
)
xnnpack_unit_test(
+ name = "qs8_vadd_minmax_test",
+ srcs = [
+ "test/qs8-vadd-minmax.cc",
+ "test/vadd-microkernel-tester.h",
+ ] + MICROKERNEL_TEST_HDRS,
+ deps = MICROKERNEL_TEST_DEPS,
+)
+
+xnnpack_unit_test(
name = "qu8_avgpool_minmax_test",
srcs = [
"test/qu8-avgpool-minmax.cc",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9497263..be1fee6 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1361,6 +1361,10 @@
src/qs8-requantization/fp32-sse2.c
src/qs8-requantization/precise-sse2.c
src/qs8-requantization/q31-sse2.c
+ src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c
+ src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x16.c
+ src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x24.c
+ src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x32.c
src/qu8-avgpool/9p8x-minmax-sse2-c8.c
src/qu8-avgpool/9x-minmax-sse2-c8.c
src/qu8-igemm/4x4c2-minmax-sse2.c
@@ -1489,6 +1493,10 @@
src/qs8-requantization/fp32-sse4.c
src/qs8-requantization/precise-sse4.c
src/qs8-requantization/q31-sse4.c
+ src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x8.c
+ src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x16.c
+ src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x24.c
+ src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x32.c
src/qu8-requantization/precise-sse4.c
src/qu8-requantization/q31-sse4.c
src/math/roundne-sse41.c
diff --git a/scripts/generate-f16-vbinary.sh b/scripts/generate-f16-vbinary.sh
index 491cdfa..b44636f 100755
--- a/scripts/generate-f16-vbinary.sh
+++ b/scripts/generate-f16-vbinary.sh
@@ -40,19 +40,19 @@
tools/xngen src/f16-vbinary/vopc-neonfp16arith.c.in -D OP=RSUB -D BATCH_TILE=16 -D ACTIVATION=MINMAX -o src/f16-vbinary/gen/vrsubc-minmax-neonfp16arith-x16.c
################################## Unit tests #################################
-tools/generate-vbinary-test.py --spec test/f16-vadd-minmax.yaml --output test/f16-vadd-minmax.cc
-tools/generate-vbinary-test.py --spec test/f16-vdiv-minmax.yaml --output test/f16-vdiv-minmax.cc
-tools/generate-vbinary-test.py --spec test/f16-vmax.yaml --output test/f16-vmax.cc
-tools/generate-vbinary-test.py --spec test/f16-vmin.yaml --output test/f16-vmin.cc
-tools/generate-vbinary-test.py --spec test/f16-vmul-minmax.yaml --output test/f16-vmul-minmax.cc
-tools/generate-vbinary-test.py --spec test/f16-vsqrdiff.yaml --output test/f16-vsqrdiff.cc
-tools/generate-vbinary-test.py --spec test/f16-vsub-minmax.yaml --output test/f16-vsub-minmax.cc
-tools/generate-vbinary-test.py --spec test/f16-vaddc-minmax.yaml --output test/f16-vaddc-minmax.cc
-tools/generate-vbinary-test.py --spec test/f16-vdivc-minmax.yaml --output test/f16-vdivc-minmax.cc
-tools/generate-vbinary-test.py --spec test/f16-vrdivc-minmax.yaml --output test/f16-vrdivc-minmax.cc
-tools/generate-vbinary-test.py --spec test/f16-vmaxc.yaml --output test/f16-vmaxc.cc
-tools/generate-vbinary-test.py --spec test/f16-vminc.yaml --output test/f16-vminc.cc
-tools/generate-vbinary-test.py --spec test/f16-vmulc-minmax.yaml --output test/f16-vmulc-minmax.cc
-tools/generate-vbinary-test.py --spec test/f16-vsqrdiffc.yaml --output test/f16-vsqrdiffc.cc
-tools/generate-vbinary-test.py --spec test/f16-vsubc-minmax.yaml --output test/f16-vsubc-minmax.cc
-tools/generate-vbinary-test.py --spec test/f16-vrsubc-minmax.yaml --output test/f16-vrsubc-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f16-vadd-minmax.yaml --output test/f16-vadd-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f16-vdiv-minmax.yaml --output test/f16-vdiv-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f16-vmax.yaml --output test/f16-vmax.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f16-vmin.yaml --output test/f16-vmin.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f16-vmul-minmax.yaml --output test/f16-vmul-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f16-vsqrdiff.yaml --output test/f16-vsqrdiff.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f16-vsub-minmax.yaml --output test/f16-vsub-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f16-vaddc-minmax.yaml --output test/f16-vaddc-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f16-vdivc-minmax.yaml --output test/f16-vdivc-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f16-vrdivc-minmax.yaml --output test/f16-vrdivc-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f16-vmaxc.yaml --output test/f16-vmaxc.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f16-vminc.yaml --output test/f16-vminc.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f16-vmulc-minmax.yaml --output test/f16-vmulc-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f16-vsqrdiffc.yaml --output test/f16-vsqrdiffc.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f16-vsubc-minmax.yaml --output test/f16-vsubc-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f16-vrsubc-minmax.yaml --output test/f16-vrsubc-minmax.cc
diff --git a/scripts/generate-f32-vbinary.sh b/scripts/generate-f32-vbinary.sh
index bbd59c6..c0ecaae 100755
--- a/scripts/generate-f32-vbinary.sh
+++ b/scripts/generate-f32-vbinary.sh
@@ -446,41 +446,40 @@
tools/xngen src/f32-vbinary/vopc-avx512f.c.in -D OP=SUB -D BATCH_TILE=32 -D ACTIVATION=MINMAX -o src/f32-vbinary/gen/vsubc-minmax-avx512f-x32.c
################################## Unit tests #################################
-tools/generate-vbinary-test.py --spec test/f32-vadd-minmax.yaml --output test/f32-vadd-minmax.cc
-tools/generate-vbinary-test.py --spec test/f32-vadd-relu.yaml --output test/f32-vadd-relu.cc
-tools/generate-vbinary-test.py --spec test/f32-vadd.yaml --output test/f32-vadd.cc
-tools/generate-vbinary-test.py --spec test/f32-vdiv-minmax.yaml --output test/f32-vdiv-minmax.cc
-tools/generate-vbinary-test.py --spec test/f32-vdiv-relu.yaml --output test/f32-vdiv-relu.cc
-tools/generate-vbinary-test.py --spec test/f32-vdiv.yaml --output test/f32-vdiv.cc
-tools/generate-vbinary-test.py --spec test/f32-vmax.yaml --output test/f32-vmax.cc
-tools/generate-vbinary-test.py --spec test/f32-vmin.yaml --output test/f32-vmin.cc
-tools/generate-vbinary-test.py --spec test/f32-vmul-minmax.yaml --output test/f32-vmul-minmax.cc
-tools/generate-vbinary-test.py --spec test/f32-vmul-relu.yaml --output test/f32-vmul-relu.cc
-tools/generate-vbinary-test.py --spec test/f32-vmul.yaml --output test/f32-vmul.cc
-tools/generate-vbinary-test.py --spec test/f32-vsqrdiff.yaml --output test/f32-vsqrdiff.cc
-tools/generate-vbinary-test.py --spec test/f32-vsub-minmax.yaml --output test/f32-vsub-minmax.cc
-tools/generate-vbinary-test.py --spec test/f32-vsub-relu.yaml --output test/f32-vsub-relu.cc
-tools/generate-vbinary-test.py --spec test/f32-vsub.yaml --output test/f32-vsub.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vadd-minmax.yaml --output test/f32-vadd-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vadd-relu.yaml --output test/f32-vadd-relu.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vadd.yaml --output test/f32-vadd.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vdiv-minmax.yaml --output test/f32-vdiv-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vdiv-relu.yaml --output test/f32-vdiv-relu.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vdiv.yaml --output test/f32-vdiv.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vmax.yaml --output test/f32-vmax.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vmin.yaml --output test/f32-vmin.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vmul-minmax.yaml --output test/f32-vmul-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vmul-relu.yaml --output test/f32-vmul-relu.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vmul.yaml --output test/f32-vmul.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vsqrdiff.yaml --output test/f32-vsqrdiff.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vsub-minmax.yaml --output test/f32-vsub-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vsub-relu.yaml --output test/f32-vsub-relu.cc
+tools/generate-vbinary-test.py --tester VBinOpMicrokernelTester --spec test/f32-vsub.yaml --output test/f32-vsub.cc
-tools/generate-vbinary-test.py --spec test/f32-vaddc-minmax.yaml --output test/f32-vaddc-minmax.cc
-tools/generate-vbinary-test.py --spec test/f32-vaddc-relu.yaml --output test/f32-vaddc-relu.cc
-tools/generate-vbinary-test.py --spec test/f32-vaddc.yaml --output test/f32-vaddc.cc
-tools/generate-vbinary-test.py --spec test/f32-vdivc-minmax.yaml --output test/f32-vdivc-minmax.cc
-tools/generate-vbinary-test.py --spec test/f32-vdivc-relu.yaml --output test/f32-vdivc-relu.cc
-tools/generate-vbinary-test.py --spec test/f32-vdivc.yaml --output test/f32-vdivc.cc
-tools/generate-vbinary-test.py --spec test/f32-vmaxc.yaml --output test/f32-vmaxc.cc
-tools/generate-vbinary-test.py --spec test/f32-vminc.yaml --output test/f32-vminc.cc
-tools/generate-vbinary-test.py --spec test/f32-vmulc-minmax.yaml --output test/f32-vmulc-minmax.cc
-tools/generate-vbinary-test.py --spec test/f32-vmulc-relu.yaml --output test/f32-vmulc-relu.cc
-tools/generate-vbinary-test.py --spec test/f32-vmulc.yaml --output test/f32-vmulc.cc
-tools/generate-vbinary-test.py --spec test/f32-vrdivc-minmax.yaml --output test/f32-vrdivc-minmax.cc
-tools/generate-vbinary-test.py --spec test/f32-vrdivc-relu.yaml --output test/f32-vrdivc-relu.cc
-tools/generate-vbinary-test.py --spec test/f32-vrdivc.yaml --output test/f32-vrdivc.cc
-tools/generate-vbinary-test.py --spec test/f32-vrsubc-minmax.yaml --output test/f32-vrsubc-minmax.cc
-tools/generate-vbinary-test.py --spec test/f32-vrsubc-relu.yaml --output test/f32-vrsubc-relu.cc
-tools/generate-vbinary-test.py --spec test/f32-vrsubc.yaml --output test/f32-vrsubc.cc
-tools/generate-vbinary-test.py --spec test/f32-vsqrdiffc.yaml --output test/f32-vsqrdiffc.cc
-tools/generate-vbinary-test.py --spec test/f32-vsqrdiffc.yaml --output test/f32-vsqrdiffc.cc
-tools/generate-vbinary-test.py --spec test/f32-vsubc-minmax.yaml --output test/f32-vsubc-minmax.cc
-tools/generate-vbinary-test.py --spec test/f32-vsubc-relu.yaml --output test/f32-vsubc-relu.cc
-tools/generate-vbinary-test.py --spec test/f32-vsubc.yaml --output test/f32-vsubc.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vaddc-minmax.yaml --output test/f32-vaddc-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vaddc-relu.yaml --output test/f32-vaddc-relu.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vaddc.yaml --output test/f32-vaddc.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vdivc-minmax.yaml --output test/f32-vdivc-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vdivc-relu.yaml --output test/f32-vdivc-relu.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vdivc.yaml --output test/f32-vdivc.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vmaxc.yaml --output test/f32-vmaxc.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vminc.yaml --output test/f32-vminc.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vmulc-minmax.yaml --output test/f32-vmulc-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vmulc-relu.yaml --output test/f32-vmulc-relu.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vmulc.yaml --output test/f32-vmulc.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vrdivc-minmax.yaml --output test/f32-vrdivc-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vrdivc-relu.yaml --output test/f32-vrdivc-relu.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vrdivc.yaml --output test/f32-vrdivc.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vrsubc-minmax.yaml --output test/f32-vrsubc-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vrsubc-relu.yaml --output test/f32-vrsubc-relu.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vrsubc.yaml --output test/f32-vrsubc.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vsqrdiffc.yaml --output test/f32-vsqrdiffc.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vsubc-minmax.yaml --output test/f32-vsubc-minmax.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vsubc-relu.yaml --output test/f32-vsubc-relu.cc
+tools/generate-vbinary-test.py --tester VBinOpCMicrokernelTester --spec test/f32-vsubc.yaml --output test/f32-vsubc.cc
diff --git a/scripts/generate-qs8-vadd.sh b/scripts/generate-qs8-vadd.sh
new file mode 100755
index 0000000..7d89b40
--- /dev/null
+++ b/scripts/generate-qs8-vadd.sh
@@ -0,0 +1,19 @@
+#!/bin/sh
+# Copyright 2020 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+################################### x86 SSE ###################################
+tools/xngen src/qs8-vadd/sse-mul16-ld64.c.in -D BATCH_TILE=8 -D SSE=2 -o src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c
+tools/xngen src/qs8-vadd/sse-mul16-ld64.c.in -D BATCH_TILE=16 -D SSE=2 -o src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x16.c
+tools/xngen src/qs8-vadd/sse-mul16-ld64.c.in -D BATCH_TILE=24 -D SSE=2 -o src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x24.c
+tools/xngen src/qs8-vadd/sse-mul16-ld64.c.in -D BATCH_TILE=32 -D SSE=2 -o src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x32.c
+
+tools/xngen src/qs8-vadd/sse-mul16-ld64.c.in -D BATCH_TILE=8 -D SSE=4 -o src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x8.c
+tools/xngen src/qs8-vadd/sse-mul16-ld64.c.in -D BATCH_TILE=16 -D SSE=4 -o src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x16.c
+tools/xngen src/qs8-vadd/sse-mul16-ld64.c.in -D BATCH_TILE=24 -D SSE=4 -o src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x24.c
+tools/xngen src/qs8-vadd/sse-mul16-ld64.c.in -D BATCH_TILE=32 -D SSE=4 -o src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x32.c
+
+################################## Unit tests #################################
+tools/generate-vbinary-test.py --tester VAddMicrokernelTester --spec test/qs8-vadd-minmax.yaml --output test/qs8-vadd-minmax.cc
diff --git a/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x4.c b/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x4.c
index 049b6e9..9306ed1 100644
--- a/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x4.c
+++ b/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x4.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x8.c b/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x8.c
index c654389..bdc6f79 100644
--- a/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x8.c
+++ b/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-arm-x8.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x4.c b/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x4.c
index a2ce9d6..eb0cc44 100644
--- a/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x4.c
+++ b/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x4.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x8.c b/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x8.c
index 2c53f3c..5add499 100644
--- a/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x8.c
+++ b/src/f32-vbinary/gen/vaddc-minmax-wasmsimd-x86-x8.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vaddc-wasmsimd-x4.c b/src/f32-vbinary/gen/vaddc-wasmsimd-x4.c
index 9e3b56a..d69a0cd 100644
--- a/src/f32-vbinary/gen/vaddc-wasmsimd-x4.c
+++ b/src/f32-vbinary/gen/vaddc-wasmsimd-x4.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vaddc-wasmsimd-x8.c b/src/f32-vbinary/gen/vaddc-wasmsimd-x8.c
index c8e772b..63ac66c 100644
--- a/src/f32-vbinary/gen/vaddc-wasmsimd-x8.c
+++ b/src/f32-vbinary/gen/vaddc-wasmsimd-x8.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x4.c b/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x4.c
index 4037443..e09c4d5 100644
--- a/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x4.c
+++ b/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x4.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x8.c b/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x8.c
index 9d6de46..fff7534 100644
--- a/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x8.c
+++ b/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-arm-x8.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x4.c b/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x4.c
index 9f2d6bf..2b41e6e 100644
--- a/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x4.c
+++ b/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x4.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x8.c b/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x8.c
index 18bd5e7..560e2e7 100644
--- a/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x8.c
+++ b/src/f32-vbinary/gen/vdivc-minmax-wasmsimd-x86-x8.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vdivc-wasmsimd-x4.c b/src/f32-vbinary/gen/vdivc-wasmsimd-x4.c
index e3bfe9b..b20595b 100644
--- a/src/f32-vbinary/gen/vdivc-wasmsimd-x4.c
+++ b/src/f32-vbinary/gen/vdivc-wasmsimd-x4.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vdivc-wasmsimd-x8.c b/src/f32-vbinary/gen/vdivc-wasmsimd-x8.c
index c9043b5..bf0375d 100644
--- a/src/f32-vbinary/gen/vdivc-wasmsimd-x8.c
+++ b/src/f32-vbinary/gen/vdivc-wasmsimd-x8.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x4.c b/src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x4.c
index 1aa9ada..7734b4a 100644
--- a/src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x4.c
+++ b/src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x4.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x8.c b/src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x8.c
index 347d060..216208c 100644
--- a/src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x8.c
+++ b/src/f32-vbinary/gen/vmaxc-wasmsimd-arm-x8.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x4.c b/src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x4.c
index d111b57..04cdd57 100644
--- a/src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x4.c
+++ b/src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x4.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x8.c b/src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x8.c
index 1325b6c..8491c45 100644
--- a/src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x8.c
+++ b/src/f32-vbinary/gen/vmaxc-wasmsimd-x86-x8.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vminc-wasmsimd-arm-x4.c b/src/f32-vbinary/gen/vminc-wasmsimd-arm-x4.c
index 40c3f33..844c072 100644
--- a/src/f32-vbinary/gen/vminc-wasmsimd-arm-x4.c
+++ b/src/f32-vbinary/gen/vminc-wasmsimd-arm-x4.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vminc-wasmsimd-arm-x8.c b/src/f32-vbinary/gen/vminc-wasmsimd-arm-x8.c
index bdd7c63..a819043 100644
--- a/src/f32-vbinary/gen/vminc-wasmsimd-arm-x8.c
+++ b/src/f32-vbinary/gen/vminc-wasmsimd-arm-x8.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vminc-wasmsimd-x86-x4.c b/src/f32-vbinary/gen/vminc-wasmsimd-x86-x4.c
index 463cea3..c845cfa 100644
--- a/src/f32-vbinary/gen/vminc-wasmsimd-x86-x4.c
+++ b/src/f32-vbinary/gen/vminc-wasmsimd-x86-x4.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vminc-wasmsimd-x86-x8.c b/src/f32-vbinary/gen/vminc-wasmsimd-x86-x8.c
index 56fd55e..b6b4a94 100644
--- a/src/f32-vbinary/gen/vminc-wasmsimd-x86-x8.c
+++ b/src/f32-vbinary/gen/vminc-wasmsimd-x86-x8.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x4.c b/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x4.c
index 7dbe2ef..f1969de 100644
--- a/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x4.c
+++ b/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x4.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x8.c b/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x8.c
index 98e6807..a97f55e 100644
--- a/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x8.c
+++ b/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-arm-x8.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x4.c b/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x4.c
index ca33b63..edce051 100644
--- a/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x4.c
+++ b/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x4.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x8.c b/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x8.c
index d76fb6c..a327ca8 100644
--- a/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x8.c
+++ b/src/f32-vbinary/gen/vmulc-minmax-wasmsimd-x86-x8.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vmulc-wasmsimd-x4.c b/src/f32-vbinary/gen/vmulc-wasmsimd-x4.c
index d50579d..2529220 100644
--- a/src/f32-vbinary/gen/vmulc-wasmsimd-x4.c
+++ b/src/f32-vbinary/gen/vmulc-wasmsimd-x4.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vmulc-wasmsimd-x8.c b/src/f32-vbinary/gen/vmulc-wasmsimd-x8.c
index 1d5f972..219ecb1 100644
--- a/src/f32-vbinary/gen/vmulc-wasmsimd-x8.c
+++ b/src/f32-vbinary/gen/vmulc-wasmsimd-x8.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x4.c b/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x4.c
index 55f2079..71bc27f 100644
--- a/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x4.c
+++ b/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x4.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x8.c b/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x8.c
index bc02f2c..911da0a 100644
--- a/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x8.c
+++ b/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-arm-x8.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x4.c b/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x4.c
index 04216f5..6c59816 100644
--- a/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x4.c
+++ b/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x4.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x8.c b/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x8.c
index 21ecca2..ebfcd2d 100644
--- a/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x8.c
+++ b/src/f32-vbinary/gen/vrdivc-minmax-wasmsimd-x86-x8.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(¶ms->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(¶ms->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vrdivc-wasmsimd-x4.c b/src/f32-vbinary/gen/vrdivc-wasmsimd-x4.c
index bbcf6c0..6b16362 100644
--- a/src/f32-vbinary/gen/vrdivc-wasmsimd-x4.c
+++ b/src/f32-vbinary/gen/vrdivc-wasmsimd-x4.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vrdivc-wasmsimd-x8.c b/src/f32-vbinary/gen/vrdivc-wasmsimd-x8.c
index 48d9528..3142998 100644
--- a/src/f32-vbinary/gen/vrdivc-wasmsimd-x8.c
+++ b/src/f32-vbinary/gen/vrdivc-wasmsimd-x8.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x4.c b/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x4.c
index 6099a52..f8cc404 100644
--- a/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x4.c
+++ b/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x4.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x8.c b/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x8.c
index 9062993..a05c990 100644
--- a/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x8.c
+++ b/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-arm-x8.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x4.c b/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x4.c
index a974232..d2d9986 100644
--- a/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x4.c
+++ b/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x4.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x8.c b/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x8.c
index 993e222..83abb9f 100644
--- a/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x8.c
+++ b/src/f32-vbinary/gen/vrsubc-minmax-wasmsimd-x86-x8.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vrsubc-wasmsimd-x4.c b/src/f32-vbinary/gen/vrsubc-wasmsimd-x4.c
index 21e51d1..c882376 100644
--- a/src/f32-vbinary/gen/vrsubc-wasmsimd-x4.c
+++ b/src/f32-vbinary/gen/vrsubc-wasmsimd-x4.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vrsubc-wasmsimd-x8.c b/src/f32-vbinary/gen/vrsubc-wasmsimd-x8.c
index 8c27319..3228688 100644
--- a/src/f32-vbinary/gen/vrsubc-wasmsimd-x8.c
+++ b/src/f32-vbinary/gen/vrsubc-wasmsimd-x8.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x4.c b/src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x4.c
index e1fa64c..dbe15f7 100644
--- a/src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x4.c
+++ b/src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x4.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x8.c b/src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x8.c
index f90aadb..520cf84 100644
--- a/src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x8.c
+++ b/src/f32-vbinary/gen/vsqrdiffc-wasmsimd-x8.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x4.c b/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x4.c
index 6177309..f392da1 100644
--- a/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x4.c
+++ b/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x4.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x8.c b/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x8.c
index 03b7b1d..bc78884 100644
--- a/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x8.c
+++ b/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-arm-x8.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x4.c b/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x4.c
index e521a74..4aef551 100644
--- a/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x4.c
+++ b/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x4.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x8.c b/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x8.c
index b4ba3af..941b5b5 100644
--- a/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x8.c
+++ b/src/f32-vbinary/gen/vsubc-minmax-wasmsimd-x86-x8.c
@@ -30,7 +30,6 @@
const v128_t vy_min = wasm_v32x4_load_splat(&params->scalar.min);
const v128_t vy_max = wasm_v32x4_load_splat(&params->scalar.max);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vsubc-wasmsimd-x4.c b/src/f32-vbinary/gen/vsubc-wasmsimd-x4.c
index 33d0b37..e2c5a1a 100644
--- a/src/f32-vbinary/gen/vsubc-wasmsimd-x4.c
+++ b/src/f32-vbinary/gen/vsubc-wasmsimd-x4.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 4 * sizeof(float); n -= 4 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/gen/vsubc-wasmsimd-x8.c b/src/f32-vbinary/gen/vsubc-wasmsimd-x8.c
index 4abbc1b..e4eb9e1 100644
--- a/src/f32-vbinary/gen/vsubc-wasmsimd-x8.c
+++ b/src/f32-vbinary/gen/vsubc-wasmsimd-x8.c
@@ -28,7 +28,6 @@
assert(b != NULL);
assert(y != NULL);
-
const v128_t vb = wasm_v32x4_load_splat(b);
for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
const v128_t va0123 = wasm_v128_load(a);
diff --git a/src/f32-vbinary/vopc-wasmsimd.c.in b/src/f32-vbinary/vopc-wasmsimd.c.in
index c8ba8a9..8e8303d 100644
--- a/src/f32-vbinary/vopc-wasmsimd.c.in
+++ b/src/f32-vbinary/vopc-wasmsimd.c.in
@@ -28,7 +28,7 @@
$ "SQRDIFF": lambda x: "wasm_f32x4_sub(%s, vb)" % x,
$}[OP]
$assert ACTIVATION in ["LINEAR", "RELU", "MINMAX"]
-$ARCH_SUFFIX = "" if ACTIVATION in ["LINEAR", "RELU"] else "_x86" if X86 else "_arm"
+$ARCH_SUFFIX = "" if ACTIVATION in ["LINEAR", "RELU"] and OP not in ["MIN", "MAX"] else "_x86" if X86 else "_arm"
$ACTIVATION_SUFFIX = {"LINEAR": ""}.get(ACTIVATION, "_" + ACTIVATION.lower())
$PARAMS = {"LINEAR": "xnn_f32_default_params", "RELU": "xnn_f32_relu_params", "MINMAX": "xnn_f32_minmax_params"}[ACTIVATION]
void xnn_f32_v${OP.lower()}c${ACTIVATION_SUFFIX}_ukernel__wasmsimd${ARCH_SUFFIX}_x${BATCH_TILE}(
diff --git a/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x16.c b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x16.c
new file mode 100644
index 0000000..593785a
--- /dev/null
+++ b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x16.c
@@ -0,0 +1,164 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vadd/sse-mul16-ld64.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16(
+ size_t n,
+ const int8_t* input_x,
+ const int8_t* input_y,
+ int8_t* output,
+ const union xnn_qs8_add_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const __m128i vzero_point_product = _mm_load_si128((const __m128i*) &params->sse2.zero_point_product);
+  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_lo);
+  const __m128i vx_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_hi);
+  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_lo);
+  const __m128i vy_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_hi);
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
+ const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+
+ for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
+ __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x);
+ __m128i vy01234567 = _mm_loadl_epi64((const __m128i*) input_y);
+ __m128i vx89ABCDEF = _mm_loadl_epi64((const __m128i*) (input_x + 8));
+ __m128i vy89ABCDEF = _mm_loadl_epi64((const __m128i*) (input_y + 8));
+ input_x += 16;
+ input_y += 16;
+
+ vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567));
+ vy01234567 = _mm_unpacklo_epi8(vy01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vy01234567));
+ vx89ABCDEF = _mm_unpacklo_epi8(vx89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vx89ABCDEF));
+ vy89ABCDEF = _mm_unpacklo_epi8(vy89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vy89ABCDEF));
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+ __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
+ __m128i vyprod89ABCDEFhi = _mm_mulhi_epu16(vy89ABCDEF, vy_multiplier_lo);
+ const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
+ const __m128i vyprod89ABCDEFlo = _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+ vxprod89ABCDEFhi = _mm_add_epi16(vxprod89ABCDEFhi, _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_hi));
+ vyprod89ABCDEFhi = _mm_add_epi16(vyprod89ABCDEFhi, _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+ vxprod89ABCDEFhi = _mm_sub_epi16(vxprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vx89ABCDEF, 15), vx_multiplier_lo));
+ vyprod89ABCDEFhi = _mm_sub_epi16(vyprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vy89ABCDEF, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc89AB = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod89ABCDEFlo, vxprod89ABCDEFhi));
+ __m128i vaccCDEF = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod89ABCDEFlo, vxprod89ABCDEFhi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vyprod89ABCDEFlo, vyprod89ABCDEFhi));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vyprod89ABCDEFlo, vyprod89ABCDEFhi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+ const __m128i vrem89AB = _mm_add_epi32(_mm_and_si128(vacc89AB, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc89AB));
+ const __m128i vremCDEF = _mm_add_epi32(_mm_and_si128(vaccCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccCDEF));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+ vacc89AB = _mm_sub_epi32(_mm_sra_epi32(vacc89AB, vshift), _mm_cmpgt_epi32(vrem89AB, vremainder_threshold));
+ vaccCDEF = _mm_sub_epi32(_mm_sra_epi32(vaccCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+ vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
+
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+ vout89ABCDEF = _mm_min_epi16(vout89ABCDEF, voutput_max);
+
+ const __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+
+ _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+ output += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ do {
+ __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x);
+ __m128i vy01234567 = _mm_loadl_epi64((const __m128i*) input_y);
+ input_x += 8;
+ input_y += 8;
+
+ vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567));
+ vy01234567 = _mm_unpacklo_epi8(vy01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vy01234567));
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+
+ __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+
+ if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ n -= 8;
+ } else {
+ if (n & (4 * sizeof(int8_t))) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ *output = (int32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ output += 1;
+ }
+ n = 0;
+ }
+ } while (n != 0);
+ }
+}
diff --git a/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x24.c b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x24.c
new file mode 100644
index 0000000..1e54be4
--- /dev/null
+++ b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x24.c
@@ -0,0 +1,189 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vadd/sse-mul16-ld64.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24(
+ size_t n,
+ const int8_t* input_x,
+ const int8_t* input_y,
+ int8_t* output,
+ const union xnn_qs8_add_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const __m128i vzero_point_product = _mm_load_si128((const __m128i*) &params->sse2.zero_point_product);
+  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_lo);
+  const __m128i vx_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_hi);
+  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_lo);
+  const __m128i vy_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_hi);
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
+ const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+
+ for (; n >= 24 * sizeof(int8_t); n -= 24 * sizeof(int8_t)) {
+ __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x);
+ __m128i vy01234567 = _mm_loadl_epi64((const __m128i*) input_y);
+ __m128i vx89ABCDEF = _mm_loadl_epi64((const __m128i*) (input_x + 8));
+ __m128i vy89ABCDEF = _mm_loadl_epi64((const __m128i*) (input_y + 8));
+ __m128i vxGHIJKLMN = _mm_loadl_epi64((const __m128i*) (input_x + 16));
+ __m128i vyGHIJKLMN = _mm_loadl_epi64((const __m128i*) (input_y + 16));
+ input_x += 24;
+ input_y += 24;
+
+ vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567));
+ vy01234567 = _mm_unpacklo_epi8(vy01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vy01234567));
+ vx89ABCDEF = _mm_unpacklo_epi8(vx89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vx89ABCDEF));
+ vy89ABCDEF = _mm_unpacklo_epi8(vy89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vy89ABCDEF));
+ vxGHIJKLMN = _mm_unpacklo_epi8(vxGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), vxGHIJKLMN));
+ vyGHIJKLMN = _mm_unpacklo_epi8(vyGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), vyGHIJKLMN));
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+ __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
+ __m128i vyprod89ABCDEFhi = _mm_mulhi_epu16(vy89ABCDEF, vy_multiplier_lo);
+ const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
+ const __m128i vyprod89ABCDEFlo = _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_lo);
+ __m128i vxprodGHIJKLMNhi = _mm_mulhi_epu16(vxGHIJKLMN, vx_multiplier_lo);
+ __m128i vyprodGHIJKLMNhi = _mm_mulhi_epu16(vyGHIJKLMN, vy_multiplier_lo);
+ const __m128i vxprodGHIJKLMNlo = _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_lo);
+ const __m128i vyprodGHIJKLMNlo = _mm_mullo_epi16(vyGHIJKLMN, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+ vxprod89ABCDEFhi = _mm_add_epi16(vxprod89ABCDEFhi, _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_hi));
+ vyprod89ABCDEFhi = _mm_add_epi16(vyprod89ABCDEFhi, _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_hi));
+ vxprodGHIJKLMNhi = _mm_add_epi16(vxprodGHIJKLMNhi, _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_hi));
+ vyprodGHIJKLMNhi = _mm_add_epi16(vyprodGHIJKLMNhi, _mm_mullo_epi16(vyGHIJKLMN, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+ vxprod89ABCDEFhi = _mm_sub_epi16(vxprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vx89ABCDEF, 15), vx_multiplier_lo));
+ vyprod89ABCDEFhi = _mm_sub_epi16(vyprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vy89ABCDEF, 15), vy_multiplier_lo));
+ vxprodGHIJKLMNhi = _mm_sub_epi16(vxprodGHIJKLMNhi, _mm_and_si128(_mm_srai_epi16(vxGHIJKLMN, 15), vx_multiplier_lo));
+ vyprodGHIJKLMNhi = _mm_sub_epi16(vyprodGHIJKLMNhi, _mm_and_si128(_mm_srai_epi16(vyGHIJKLMN, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc89AB = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod89ABCDEFlo, vxprod89ABCDEFhi));
+ __m128i vaccCDEF = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod89ABCDEFlo, vxprod89ABCDEFhi));
+ __m128i vaccGHIJ = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprodGHIJKLMNlo, vxprodGHIJKLMNhi));
+ __m128i vaccKLMN = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprodGHIJKLMNlo, vxprodGHIJKLMNhi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vyprod89ABCDEFlo, vyprod89ABCDEFhi));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vyprod89ABCDEFlo, vyprod89ABCDEFhi));
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_unpacklo_epi16(vyprodGHIJKLMNlo, vyprodGHIJKLMNhi));
+ vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_unpackhi_epi16(vyprodGHIJKLMNlo, vyprodGHIJKLMNhi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+ const __m128i vrem89AB = _mm_add_epi32(_mm_and_si128(vacc89AB, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc89AB));
+ const __m128i vremCDEF = _mm_add_epi32(_mm_and_si128(vaccCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccCDEF));
+ const __m128i vremGHIJ = _mm_add_epi32(_mm_and_si128(vaccGHIJ, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccGHIJ));
+ const __m128i vremKLMN = _mm_add_epi32(_mm_and_si128(vaccKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccKLMN));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+ vacc89AB = _mm_sub_epi32(_mm_sra_epi32(vacc89AB, vshift), _mm_cmpgt_epi32(vrem89AB, vremainder_threshold));
+ vaccCDEF = _mm_sub_epi32(_mm_sra_epi32(vaccCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
+ vaccGHIJ = _mm_sub_epi32(_mm_sra_epi32(vaccGHIJ, vshift), _mm_cmpgt_epi32(vremGHIJ, vremainder_threshold));
+ vaccKLMN = _mm_sub_epi32(_mm_sra_epi32(vaccKLMN, vshift), _mm_cmpgt_epi32(vremKLMN, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+ __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
+
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+ vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
+ voutGHIJKLMN = _mm_max_epi16(voutGHIJKLMN, voutput_min);
+
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+ vout89ABCDEF = _mm_min_epi16(vout89ABCDEF, voutput_max);
+ voutGHIJKLMN = _mm_min_epi16(voutGHIJKLMN, voutput_max);
+
+ const __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+ const __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
+
+ _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+ _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
+ output += 24;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ do {
+ __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x);
+ __m128i vy01234567 = _mm_loadl_epi64((const __m128i*) input_y);
+ input_x += 8;
+ input_y += 8;
+
+ vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567));
+ vy01234567 = _mm_unpacklo_epi8(vy01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vy01234567));
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+
+ __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+
+ if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ n -= 8;
+ } else {
+ if (n & (4 * sizeof(int8_t))) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ *output = (int32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ output += 1;
+ }
+ n = 0;
+ }
+ } while (n != 0);
+ }
+}
diff --git a/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x32.c b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x32.c
new file mode 100644
index 0000000..ca37d6d
--- /dev/null
+++ b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x32.c
@@ -0,0 +1,212 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vadd/sse-mul16-ld64.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32(
+ size_t n,
+ const int8_t* input_x,
+ const int8_t* input_y,
+ int8_t* output,
+ const union xnn_qs8_add_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ const __m128i vzero_point_product = _mm_load_si128((const __m128i*) &params->sse2.zero_point_product);
+ const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_lo);
+ const __m128i vx_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_hi);
+ const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_lo);
+ const __m128i vy_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_hi);
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
+ const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+
+ for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
+ __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x);
+ __m128i vy01234567 = _mm_loadl_epi64((const __m128i*) input_y);
+ __m128i vx89ABCDEF = _mm_loadl_epi64((const __m128i*) (input_x + 8));
+ __m128i vy89ABCDEF = _mm_loadl_epi64((const __m128i*) (input_y + 8));
+ __m128i vxGHIJKLMN = _mm_loadl_epi64((const __m128i*) (input_x + 16));
+ __m128i vyGHIJKLMN = _mm_loadl_epi64((const __m128i*) (input_y + 16));
+ __m128i vxOPQRSTUV = _mm_loadl_epi64((const __m128i*) (input_x + 24));
+ __m128i vyOPQRSTUV = _mm_loadl_epi64((const __m128i*) (input_y + 24));
+ input_x += 32;
+ input_y += 32;
+
+ vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567));
+ vy01234567 = _mm_unpacklo_epi8(vy01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vy01234567));
+ vx89ABCDEF = _mm_unpacklo_epi8(vx89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vx89ABCDEF));
+ vy89ABCDEF = _mm_unpacklo_epi8(vy89ABCDEF, _mm_cmpgt_epi8(_mm_setzero_si128(), vy89ABCDEF));
+ vxGHIJKLMN = _mm_unpacklo_epi8(vxGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), vxGHIJKLMN));
+ vyGHIJKLMN = _mm_unpacklo_epi8(vyGHIJKLMN, _mm_cmpgt_epi8(_mm_setzero_si128(), vyGHIJKLMN));
+ vxOPQRSTUV = _mm_unpacklo_epi8(vxOPQRSTUV, _mm_cmpgt_epi8(_mm_setzero_si128(), vxOPQRSTUV));
+ vyOPQRSTUV = _mm_unpacklo_epi8(vyOPQRSTUV, _mm_cmpgt_epi8(_mm_setzero_si128(), vyOPQRSTUV));
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+ __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
+ __m128i vyprod89ABCDEFhi = _mm_mulhi_epu16(vy89ABCDEF, vy_multiplier_lo);
+ const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
+ const __m128i vyprod89ABCDEFlo = _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_lo);
+ __m128i vxprodGHIJKLMNhi = _mm_mulhi_epu16(vxGHIJKLMN, vx_multiplier_lo);
+ __m128i vyprodGHIJKLMNhi = _mm_mulhi_epu16(vyGHIJKLMN, vy_multiplier_lo);
+ const __m128i vxprodGHIJKLMNlo = _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_lo);
+ const __m128i vyprodGHIJKLMNlo = _mm_mullo_epi16(vyGHIJKLMN, vy_multiplier_lo);
+ __m128i vxprodOPQRSTUVhi = _mm_mulhi_epu16(vxOPQRSTUV, vx_multiplier_lo);
+ __m128i vyprodOPQRSTUVhi = _mm_mulhi_epu16(vyOPQRSTUV, vy_multiplier_lo);
+ const __m128i vxprodOPQRSTUVlo = _mm_mullo_epi16(vxOPQRSTUV, vx_multiplier_lo);
+ const __m128i vyprodOPQRSTUVlo = _mm_mullo_epi16(vyOPQRSTUV, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+ vxprod89ABCDEFhi = _mm_add_epi16(vxprod89ABCDEFhi, _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_hi));
+ vyprod89ABCDEFhi = _mm_add_epi16(vyprod89ABCDEFhi, _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_hi));
+ vxprodGHIJKLMNhi = _mm_add_epi16(vxprodGHIJKLMNhi, _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_hi));
+ vyprodGHIJKLMNhi = _mm_add_epi16(vyprodGHIJKLMNhi, _mm_mullo_epi16(vyGHIJKLMN, vy_multiplier_hi));
+ vxprodOPQRSTUVhi = _mm_add_epi16(vxprodOPQRSTUVhi, _mm_mullo_epi16(vxOPQRSTUV, vx_multiplier_hi));
+ vyprodOPQRSTUVhi = _mm_add_epi16(vyprodOPQRSTUVhi, _mm_mullo_epi16(vyOPQRSTUV, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+ vxprod89ABCDEFhi = _mm_sub_epi16(vxprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vx89ABCDEF, 15), vx_multiplier_lo));
+ vyprod89ABCDEFhi = _mm_sub_epi16(vyprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vy89ABCDEF, 15), vy_multiplier_lo));
+ vxprodGHIJKLMNhi = _mm_sub_epi16(vxprodGHIJKLMNhi, _mm_and_si128(_mm_srai_epi16(vxGHIJKLMN, 15), vx_multiplier_lo));
+ vyprodGHIJKLMNhi = _mm_sub_epi16(vyprodGHIJKLMNhi, _mm_and_si128(_mm_srai_epi16(vyGHIJKLMN, 15), vy_multiplier_lo));
+ vxprodOPQRSTUVhi = _mm_sub_epi16(vxprodOPQRSTUVhi, _mm_and_si128(_mm_srai_epi16(vxOPQRSTUV, 15), vx_multiplier_lo));
+ vyprodOPQRSTUVhi = _mm_sub_epi16(vyprodOPQRSTUVhi, _mm_and_si128(_mm_srai_epi16(vyOPQRSTUV, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc89AB = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod89ABCDEFlo, vxprod89ABCDEFhi));
+ __m128i vaccCDEF = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod89ABCDEFlo, vxprod89ABCDEFhi));
+ __m128i vaccGHIJ = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprodGHIJKLMNlo, vxprodGHIJKLMNhi));
+ __m128i vaccKLMN = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprodGHIJKLMNlo, vxprodGHIJKLMNhi));
+ __m128i vaccOPQR = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprodOPQRSTUVlo, vxprodOPQRSTUVhi));
+ __m128i vaccSTUV = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprodOPQRSTUVlo, vxprodOPQRSTUVhi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vyprod89ABCDEFlo, vyprod89ABCDEFhi));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vyprod89ABCDEFlo, vyprod89ABCDEFhi));
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_unpacklo_epi16(vyprodGHIJKLMNlo, vyprodGHIJKLMNhi));
+ vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_unpackhi_epi16(vyprodGHIJKLMNlo, vyprodGHIJKLMNhi));
+ vaccOPQR = _mm_add_epi32(vaccOPQR, _mm_unpacklo_epi16(vyprodOPQRSTUVlo, vyprodOPQRSTUVhi));
+ vaccSTUV = _mm_add_epi32(vaccSTUV, _mm_unpackhi_epi16(vyprodOPQRSTUVlo, vyprodOPQRSTUVhi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+ const __m128i vrem89AB = _mm_add_epi32(_mm_and_si128(vacc89AB, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc89AB));
+ const __m128i vremCDEF = _mm_add_epi32(_mm_and_si128(vaccCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccCDEF));
+ const __m128i vremGHIJ = _mm_add_epi32(_mm_and_si128(vaccGHIJ, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccGHIJ));
+ const __m128i vremKLMN = _mm_add_epi32(_mm_and_si128(vaccKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccKLMN));
+ const __m128i vremOPQR = _mm_add_epi32(_mm_and_si128(vaccOPQR, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccOPQR));
+ const __m128i vremSTUV = _mm_add_epi32(_mm_and_si128(vaccSTUV, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccSTUV));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+ vacc89AB = _mm_sub_epi32(_mm_sra_epi32(vacc89AB, vshift), _mm_cmpgt_epi32(vrem89AB, vremainder_threshold));
+ vaccCDEF = _mm_sub_epi32(_mm_sra_epi32(vaccCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
+ vaccGHIJ = _mm_sub_epi32(_mm_sra_epi32(vaccGHIJ, vshift), _mm_cmpgt_epi32(vremGHIJ, vremainder_threshold));
+ vaccKLMN = _mm_sub_epi32(_mm_sra_epi32(vaccKLMN, vshift), _mm_cmpgt_epi32(vremKLMN, vremainder_threshold));
+ vaccOPQR = _mm_sub_epi32(_mm_sra_epi32(vaccOPQR, vshift), _mm_cmpgt_epi32(vremOPQR, vremainder_threshold));
+ vaccSTUV = _mm_sub_epi32(_mm_sra_epi32(vaccSTUV, vshift), _mm_cmpgt_epi32(vremSTUV, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+ __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
+ __m128i voutOPQRSTUV = _mm_adds_epi16(_mm_packs_epi32(vaccOPQR, vaccSTUV), voutput_zero_point);
+
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+ vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
+ voutGHIJKLMN = _mm_max_epi16(voutGHIJKLMN, voutput_min);
+ voutOPQRSTUV = _mm_max_epi16(voutOPQRSTUV, voutput_min);
+
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+ vout89ABCDEF = _mm_min_epi16(vout89ABCDEF, voutput_max);
+ voutGHIJKLMN = _mm_min_epi16(voutGHIJKLMN, voutput_max);
+ voutOPQRSTUV = _mm_min_epi16(voutOPQRSTUV, voutput_max);
+
+ const __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+ const __m128i voutGHIJKLMNOPQRSTUV = _mm_packs_epi16(voutGHIJKLMN, voutOPQRSTUV);
+
+ _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+ _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV);
+ output += 32;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ do {
+ __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x);
+ __m128i vy01234567 = _mm_loadl_epi64((const __m128i*) input_y);
+ input_x += 8;
+ input_y += 8;
+
+ vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567));
+ vy01234567 = _mm_unpacklo_epi8(vy01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vy01234567));
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+
+ __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+
+ if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ n -= 8;
+ } else {
+ if (n & (4 * sizeof(int8_t))) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ *output = (int32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ output += 1;
+ }
+ n = 0;
+ }
+ } while (n != 0);
+ }
+}
diff --git a/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c
new file mode 100644
index 0000000..b0bd593
--- /dev/null
+++ b/src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c
@@ -0,0 +1,131 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vadd/sse-mul16-ld64.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <emmintrin.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8(
+ size_t n,
+ const int8_t* input_x,
+ const int8_t* input_y,
+ int8_t* output,
+ const union xnn_qs8_add_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ const __m128i vzero_point_product = _mm_load_si128((const __m128i*) &params->sse2.zero_point_product);
+ const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_lo);
+ const __m128i vx_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_hi);
+ const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_lo);
+ const __m128i vy_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_hi);
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
+ const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x);
+ __m128i vy01234567 = _mm_loadl_epi64((const __m128i*) input_y);
+ input_x += 8;
+ input_y += 8;
+
+ vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567));
+ vy01234567 = _mm_unpacklo_epi8(vy01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vy01234567));
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+
+ const __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ {
+ __m128i vx01234567 = _mm_loadl_epi64((const __m128i*) input_x);
+ __m128i vy01234567 = _mm_loadl_epi64((const __m128i*) input_y);
+
+ vx01234567 = _mm_unpacklo_epi8(vx01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vx01234567));
+ vy01234567 = _mm_unpacklo_epi8(vy01234567, _mm_cmpgt_epi8(_mm_setzero_si128(), vy01234567));
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+
+ __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+
+ if (n & (4 * sizeof(int8_t))) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ *output = (int32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ }
+ }
+ }
+}
diff --git a/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x16.c b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x16.c
new file mode 100644
index 0000000..f8bc4d1
--- /dev/null
+++ b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x16.c
@@ -0,0 +1,158 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vadd/sse-mul16-ld64.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16(
+ size_t n,
+ const int8_t* input_x,
+ const int8_t* input_y,
+ int8_t* output,
+ const union xnn_qs8_add_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ const __m128i vzero_point_product = _mm_load_si128((const __m128i*) &params->sse2.zero_point_product);
+ const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_lo);
+ const __m128i vx_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_hi);
+ const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_lo);
+ const __m128i vy_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_hi);
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
+ const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+
+ for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
+ const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x));
+ const __m128i vy01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_y));
+ const __m128i vx89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_x + 8)));
+ const __m128i vy89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_y + 8)));
+ input_x += 16;
+ input_y += 16;
+
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+ __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
+ __m128i vyprod89ABCDEFhi = _mm_mulhi_epu16(vy89ABCDEF, vy_multiplier_lo);
+ const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
+ const __m128i vyprod89ABCDEFlo = _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+ vxprod89ABCDEFhi = _mm_add_epi16(vxprod89ABCDEFhi, _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_hi));
+ vyprod89ABCDEFhi = _mm_add_epi16(vyprod89ABCDEFhi, _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+ vxprod89ABCDEFhi = _mm_sub_epi16(vxprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vx89ABCDEF, 15), vx_multiplier_lo));
+ vyprod89ABCDEFhi = _mm_sub_epi16(vyprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vy89ABCDEF, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc89AB = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod89ABCDEFlo, vxprod89ABCDEFhi));
+ __m128i vaccCDEF = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod89ABCDEFlo, vxprod89ABCDEFhi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vyprod89ABCDEFlo, vyprod89ABCDEFhi));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vyprod89ABCDEFlo, vyprod89ABCDEFhi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+ const __m128i vrem89AB = _mm_add_epi32(_mm_and_si128(vacc89AB, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc89AB));
+ const __m128i vremCDEF = _mm_add_epi32(_mm_and_si128(vaccCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccCDEF));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+ vacc89AB = _mm_sub_epi32(_mm_sra_epi32(vacc89AB, vshift), _mm_cmpgt_epi32(vrem89AB, vremainder_threshold));
+ vaccCDEF = _mm_sub_epi32(_mm_sra_epi32(vaccCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+ vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
+
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+ vout89ABCDEF = _mm_min_epi16(vout89ABCDEF, voutput_max);
+
+ const __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+
+ _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+ output += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ do {
+ const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x));
+ const __m128i vy01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_y));
+ input_x += 8;
+ input_y += 8;
+
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+
+ __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+
+ if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ n -= 8;
+ } else {
+ if (n & (4 * sizeof(int8_t))) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
+ output += 1;
+ }
+ n = 0;
+ }
+ } while (n != 0);
+ }
+}
diff --git a/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x24.c b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x24.c
new file mode 100644
index 0000000..01ffa3e
--- /dev/null
+++ b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x24.c
@@ -0,0 +1,181 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vadd/sse-mul16-ld64.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24(
+ size_t n,
+ const int8_t* input_x,
+ const int8_t* input_y,
+ int8_t* output,
+ const union xnn_qs8_add_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const __m128i vzero_point_product = _mm_load_si128((const __m128i*) &params->sse2.zero_point_product);
+  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_lo);
+  const __m128i vx_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_hi);
+  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_lo);
+  const __m128i vy_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_hi);
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
+ const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+
+ for (; n >= 24 * sizeof(int8_t); n -= 24 * sizeof(int8_t)) {
+ const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x));
+ const __m128i vy01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_y));
+ const __m128i vx89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_x + 8)));
+ const __m128i vy89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_y + 8)));
+ const __m128i vxGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_x + 16)));
+ const __m128i vyGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_y + 16)));
+ input_x += 24;
+ input_y += 24;
+
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+ __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
+ __m128i vyprod89ABCDEFhi = _mm_mulhi_epu16(vy89ABCDEF, vy_multiplier_lo);
+ const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
+ const __m128i vyprod89ABCDEFlo = _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_lo);
+ __m128i vxprodGHIJKLMNhi = _mm_mulhi_epu16(vxGHIJKLMN, vx_multiplier_lo);
+ __m128i vyprodGHIJKLMNhi = _mm_mulhi_epu16(vyGHIJKLMN, vy_multiplier_lo);
+ const __m128i vxprodGHIJKLMNlo = _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_lo);
+ const __m128i vyprodGHIJKLMNlo = _mm_mullo_epi16(vyGHIJKLMN, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+ vxprod89ABCDEFhi = _mm_add_epi16(vxprod89ABCDEFhi, _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_hi));
+ vyprod89ABCDEFhi = _mm_add_epi16(vyprod89ABCDEFhi, _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_hi));
+ vxprodGHIJKLMNhi = _mm_add_epi16(vxprodGHIJKLMNhi, _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_hi));
+ vyprodGHIJKLMNhi = _mm_add_epi16(vyprodGHIJKLMNhi, _mm_mullo_epi16(vyGHIJKLMN, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+ vxprod89ABCDEFhi = _mm_sub_epi16(vxprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vx89ABCDEF, 15), vx_multiplier_lo));
+ vyprod89ABCDEFhi = _mm_sub_epi16(vyprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vy89ABCDEF, 15), vy_multiplier_lo));
+ vxprodGHIJKLMNhi = _mm_sub_epi16(vxprodGHIJKLMNhi, _mm_and_si128(_mm_srai_epi16(vxGHIJKLMN, 15), vx_multiplier_lo));
+ vyprodGHIJKLMNhi = _mm_sub_epi16(vyprodGHIJKLMNhi, _mm_and_si128(_mm_srai_epi16(vyGHIJKLMN, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc89AB = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod89ABCDEFlo, vxprod89ABCDEFhi));
+ __m128i vaccCDEF = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod89ABCDEFlo, vxprod89ABCDEFhi));
+ __m128i vaccGHIJ = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprodGHIJKLMNlo, vxprodGHIJKLMNhi));
+ __m128i vaccKLMN = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprodGHIJKLMNlo, vxprodGHIJKLMNhi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vyprod89ABCDEFlo, vyprod89ABCDEFhi));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vyprod89ABCDEFlo, vyprod89ABCDEFhi));
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_unpacklo_epi16(vyprodGHIJKLMNlo, vyprodGHIJKLMNhi));
+ vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_unpackhi_epi16(vyprodGHIJKLMNlo, vyprodGHIJKLMNhi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+ const __m128i vrem89AB = _mm_add_epi32(_mm_and_si128(vacc89AB, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc89AB));
+ const __m128i vremCDEF = _mm_add_epi32(_mm_and_si128(vaccCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccCDEF));
+ const __m128i vremGHIJ = _mm_add_epi32(_mm_and_si128(vaccGHIJ, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccGHIJ));
+ const __m128i vremKLMN = _mm_add_epi32(_mm_and_si128(vaccKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccKLMN));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+ vacc89AB = _mm_sub_epi32(_mm_sra_epi32(vacc89AB, vshift), _mm_cmpgt_epi32(vrem89AB, vremainder_threshold));
+ vaccCDEF = _mm_sub_epi32(_mm_sra_epi32(vaccCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
+ vaccGHIJ = _mm_sub_epi32(_mm_sra_epi32(vaccGHIJ, vshift), _mm_cmpgt_epi32(vremGHIJ, vremainder_threshold));
+ vaccKLMN = _mm_sub_epi32(_mm_sra_epi32(vaccKLMN, vshift), _mm_cmpgt_epi32(vremKLMN, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+ __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
+
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+ vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
+ voutGHIJKLMN = _mm_max_epi16(voutGHIJKLMN, voutput_min);
+
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+ vout89ABCDEF = _mm_min_epi16(vout89ABCDEF, voutput_max);
+ voutGHIJKLMN = _mm_min_epi16(voutGHIJKLMN, voutput_max);
+
+ const __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+ const __m128i voutGHIJKLMNGHIJKLMN = _mm_packs_epi16(voutGHIJKLMN, voutGHIJKLMN);
+
+ _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+ _mm_storel_epi64((__m128i*) (output + 16), voutGHIJKLMNGHIJKLMN);
+ output += 24;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ do {
+ const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x));
+ const __m128i vy01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_y));
+ input_x += 8;
+ input_y += 8;
+
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+
+ __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+
+ if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ n -= 8;
+ } else {
+ if (n & (4 * sizeof(int8_t))) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
+ output += 1;
+ }
+ n = 0;
+ }
+ } while (n != 0);
+ }
+}
diff --git a/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x32.c b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x32.c
new file mode 100644
index 0000000..57ba9e5
--- /dev/null
+++ b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x32.c
@@ -0,0 +1,202 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vadd/sse-mul16-ld64.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32(
+ size_t n,
+ const int8_t* input_x,
+ const int8_t* input_y,
+ int8_t* output,
+ const union xnn_qs8_add_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const __m128i vzero_point_product = _mm_load_si128((const __m128i*) &params->sse2.zero_point_product);
+  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_lo);
+  const __m128i vx_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_hi);
+  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_lo);
+  const __m128i vy_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_hi);
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
+ const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+
+ for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
+ const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x));
+ const __m128i vy01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_y));
+ const __m128i vx89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_x + 8)));
+ const __m128i vy89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_y + 8)));
+ const __m128i vxGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_x + 16)));
+ const __m128i vyGHIJKLMN = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_y + 16)));
+ const __m128i vxOPQRSTUV = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_x + 24)));
+ const __m128i vyOPQRSTUV = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_y + 24)));
+ input_x += 32;
+ input_y += 32;
+
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+ __m128i vxprod89ABCDEFhi = _mm_mulhi_epu16(vx89ABCDEF, vx_multiplier_lo);
+ __m128i vyprod89ABCDEFhi = _mm_mulhi_epu16(vy89ABCDEF, vy_multiplier_lo);
+ const __m128i vxprod89ABCDEFlo = _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_lo);
+ const __m128i vyprod89ABCDEFlo = _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_lo);
+ __m128i vxprodGHIJKLMNhi = _mm_mulhi_epu16(vxGHIJKLMN, vx_multiplier_lo);
+ __m128i vyprodGHIJKLMNhi = _mm_mulhi_epu16(vyGHIJKLMN, vy_multiplier_lo);
+ const __m128i vxprodGHIJKLMNlo = _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_lo);
+ const __m128i vyprodGHIJKLMNlo = _mm_mullo_epi16(vyGHIJKLMN, vy_multiplier_lo);
+ __m128i vxprodOPQRSTUVhi = _mm_mulhi_epu16(vxOPQRSTUV, vx_multiplier_lo);
+ __m128i vyprodOPQRSTUVhi = _mm_mulhi_epu16(vyOPQRSTUV, vy_multiplier_lo);
+ const __m128i vxprodOPQRSTUVlo = _mm_mullo_epi16(vxOPQRSTUV, vx_multiplier_lo);
+ const __m128i vyprodOPQRSTUVlo = _mm_mullo_epi16(vyOPQRSTUV, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+ vxprod89ABCDEFhi = _mm_add_epi16(vxprod89ABCDEFhi, _mm_mullo_epi16(vx89ABCDEF, vx_multiplier_hi));
+ vyprod89ABCDEFhi = _mm_add_epi16(vyprod89ABCDEFhi, _mm_mullo_epi16(vy89ABCDEF, vy_multiplier_hi));
+ vxprodGHIJKLMNhi = _mm_add_epi16(vxprodGHIJKLMNhi, _mm_mullo_epi16(vxGHIJKLMN, vx_multiplier_hi));
+ vyprodGHIJKLMNhi = _mm_add_epi16(vyprodGHIJKLMNhi, _mm_mullo_epi16(vyGHIJKLMN, vy_multiplier_hi));
+ vxprodOPQRSTUVhi = _mm_add_epi16(vxprodOPQRSTUVhi, _mm_mullo_epi16(vxOPQRSTUV, vx_multiplier_hi));
+ vyprodOPQRSTUVhi = _mm_add_epi16(vyprodOPQRSTUVhi, _mm_mullo_epi16(vyOPQRSTUV, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+ vxprod89ABCDEFhi = _mm_sub_epi16(vxprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vx89ABCDEF, 15), vx_multiplier_lo));
+ vyprod89ABCDEFhi = _mm_sub_epi16(vyprod89ABCDEFhi, _mm_and_si128(_mm_srai_epi16(vy89ABCDEF, 15), vy_multiplier_lo));
+ vxprodGHIJKLMNhi = _mm_sub_epi16(vxprodGHIJKLMNhi, _mm_and_si128(_mm_srai_epi16(vxGHIJKLMN, 15), vx_multiplier_lo));
+ vyprodGHIJKLMNhi = _mm_sub_epi16(vyprodGHIJKLMNhi, _mm_and_si128(_mm_srai_epi16(vyGHIJKLMN, 15), vy_multiplier_lo));
+ vxprodOPQRSTUVhi = _mm_sub_epi16(vxprodOPQRSTUVhi, _mm_and_si128(_mm_srai_epi16(vxOPQRSTUV, 15), vx_multiplier_lo));
+ vyprodOPQRSTUVhi = _mm_sub_epi16(vyprodOPQRSTUVhi, _mm_and_si128(_mm_srai_epi16(vyOPQRSTUV, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc89AB = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod89ABCDEFlo, vxprod89ABCDEFhi));
+ __m128i vaccCDEF = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod89ABCDEFlo, vxprod89ABCDEFhi));
+ __m128i vaccGHIJ = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprodGHIJKLMNlo, vxprodGHIJKLMNhi));
+ __m128i vaccKLMN = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprodGHIJKLMNlo, vxprodGHIJKLMNhi));
+ __m128i vaccOPQR = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprodOPQRSTUVlo, vxprodOPQRSTUVhi));
+ __m128i vaccSTUV = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprodOPQRSTUVlo, vxprodOPQRSTUVhi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vyprod89ABCDEFlo, vyprod89ABCDEFhi));
+ vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vyprod89ABCDEFlo, vyprod89ABCDEFhi));
+ vaccGHIJ = _mm_add_epi32(vaccGHIJ, _mm_unpacklo_epi16(vyprodGHIJKLMNlo, vyprodGHIJKLMNhi));
+ vaccKLMN = _mm_add_epi32(vaccKLMN, _mm_unpackhi_epi16(vyprodGHIJKLMNlo, vyprodGHIJKLMNhi));
+ vaccOPQR = _mm_add_epi32(vaccOPQR, _mm_unpacklo_epi16(vyprodOPQRSTUVlo, vyprodOPQRSTUVhi));
+ vaccSTUV = _mm_add_epi32(vaccSTUV, _mm_unpackhi_epi16(vyprodOPQRSTUVlo, vyprodOPQRSTUVhi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+ const __m128i vrem89AB = _mm_add_epi32(_mm_and_si128(vacc89AB, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc89AB));
+ const __m128i vremCDEF = _mm_add_epi32(_mm_and_si128(vaccCDEF, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccCDEF));
+ const __m128i vremGHIJ = _mm_add_epi32(_mm_and_si128(vaccGHIJ, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccGHIJ));
+ const __m128i vremKLMN = _mm_add_epi32(_mm_and_si128(vaccKLMN, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccKLMN));
+ const __m128i vremOPQR = _mm_add_epi32(_mm_and_si128(vaccOPQR, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccOPQR));
+ const __m128i vremSTUV = _mm_add_epi32(_mm_and_si128(vaccSTUV, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vaccSTUV));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+ vacc89AB = _mm_sub_epi32(_mm_sra_epi32(vacc89AB, vshift), _mm_cmpgt_epi32(vrem89AB, vremainder_threshold));
+ vaccCDEF = _mm_sub_epi32(_mm_sra_epi32(vaccCDEF, vshift), _mm_cmpgt_epi32(vremCDEF, vremainder_threshold));
+ vaccGHIJ = _mm_sub_epi32(_mm_sra_epi32(vaccGHIJ, vshift), _mm_cmpgt_epi32(vremGHIJ, vremainder_threshold));
+ vaccKLMN = _mm_sub_epi32(_mm_sra_epi32(vaccKLMN, vshift), _mm_cmpgt_epi32(vremKLMN, vremainder_threshold));
+ vaccOPQR = _mm_sub_epi32(_mm_sra_epi32(vaccOPQR, vshift), _mm_cmpgt_epi32(vremOPQR, vremainder_threshold));
+ vaccSTUV = _mm_sub_epi32(_mm_sra_epi32(vaccSTUV, vshift), _mm_cmpgt_epi32(vremSTUV, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
+ __m128i voutGHIJKLMN = _mm_adds_epi16(_mm_packs_epi32(vaccGHIJ, vaccKLMN), voutput_zero_point);
+ __m128i voutOPQRSTUV = _mm_adds_epi16(_mm_packs_epi32(vaccOPQR, vaccSTUV), voutput_zero_point);
+
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+ vout89ABCDEF = _mm_max_epi16(vout89ABCDEF, voutput_min);
+ voutGHIJKLMN = _mm_max_epi16(voutGHIJKLMN, voutput_min);
+ voutOPQRSTUV = _mm_max_epi16(voutOPQRSTUV, voutput_min);
+
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+ vout89ABCDEF = _mm_min_epi16(vout89ABCDEF, voutput_max);
+ voutGHIJKLMN = _mm_min_epi16(voutGHIJKLMN, voutput_max);
+ voutOPQRSTUV = _mm_min_epi16(voutOPQRSTUV, voutput_max);
+
+ const __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
+ const __m128i voutGHIJKLMNOPQRSTUV = _mm_packs_epi16(voutGHIJKLMN, voutOPQRSTUV);
+
+ _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
+ _mm_storeu_si128((__m128i*) (output + 16), voutGHIJKLMNOPQRSTUV);
+ output += 32;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ do {
+ const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x));
+ const __m128i vy01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_y));
+ input_x += 8;
+ input_y += 8;
+
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+
+ __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+
+ if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ n -= 8;
+ } else {
+ if (n & (4 * sizeof(int8_t))) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
+ output += 1;
+ }
+ n = 0;
+ }
+ } while (n != 0);
+ }
+}
diff --git a/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x8.c b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x8.c
new file mode 100644
index 0000000..0187c85
--- /dev/null
+++ b/src/qs8-vadd/gen/minmax-sse41-mul16-ld64-x8.c
@@ -0,0 +1,127 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vadd/sse-mul16-ld64.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8(
+ size_t n,
+ const int8_t* input_x,
+ const int8_t* input_y,
+ int8_t* output,
+ const union xnn_qs8_add_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const __m128i vzero_point_product = _mm_load_si128((const __m128i*) &params->sse2.zero_point_product);
+  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_lo);
+  const __m128i vx_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_hi);
+  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_lo);
+  const __m128i vy_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_hi);
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
+ const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+
+ for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
+ const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x));
+ const __m128i vy01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_y));
+ input_x += 8;
+ input_y += 8;
+
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+
+ const __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+
+ _mm_storel_epi64((__m128i*) output, vout0123456701234567);
+ output += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ {
+ const __m128i vx01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x));
+ const __m128i vy01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_y));
+
+
+ __m128i vxprod01234567hi = _mm_mulhi_epu16(vx01234567, vx_multiplier_lo);
+ __m128i vyprod01234567hi = _mm_mulhi_epu16(vy01234567, vy_multiplier_lo);
+ const __m128i vxprod01234567lo = _mm_mullo_epi16(vx01234567, vx_multiplier_lo);
+ const __m128i vyprod01234567lo = _mm_mullo_epi16(vy01234567, vy_multiplier_lo);
+
+ vxprod01234567hi = _mm_add_epi16(vxprod01234567hi, _mm_mullo_epi16(vx01234567, vx_multiplier_hi));
+ vyprod01234567hi = _mm_add_epi16(vyprod01234567hi, _mm_mullo_epi16(vy01234567, vy_multiplier_hi));
+
+ vxprod01234567hi = _mm_sub_epi16(vxprod01234567hi, _mm_and_si128(_mm_srai_epi16(vx01234567, 15), vx_multiplier_lo));
+ vyprod01234567hi = _mm_sub_epi16(vyprod01234567hi, _mm_and_si128(_mm_srai_epi16(vy01234567, 15), vy_multiplier_lo));
+
+ __m128i vacc0123 = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod01234567lo, vxprod01234567hi));
+ __m128i vacc4567 = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod01234567lo, vxprod01234567hi));
+
+ vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vyprod01234567lo, vyprod01234567hi));
+ vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vyprod01234567lo, vyprod01234567hi));
+
+ const __m128i vrem0123 = _mm_add_epi32(_mm_and_si128(vacc0123, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc0123));
+ const __m128i vrem4567 = _mm_add_epi32(_mm_and_si128(vacc4567, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc4567));
+
+ vacc0123 = _mm_sub_epi32(_mm_sra_epi32(vacc0123, vshift), _mm_cmpgt_epi32(vrem0123, vremainder_threshold));
+ vacc4567 = _mm_sub_epi32(_mm_sra_epi32(vacc4567, vshift), _mm_cmpgt_epi32(vrem4567, vremainder_threshold));
+
+ __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
+ vout01234567 = _mm_max_epi16(vout01234567, voutput_min);
+ vout01234567 = _mm_min_epi16(vout01234567, voutput_max);
+
+ __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
+
+ if (n & (4 * sizeof(int8_t))) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout0123456701234567);
+ vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
+ output += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout0123456701234567, 0);
+ vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
+ output += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
+ }
+ }
+ }
+}
diff --git a/src/qs8-vadd/sse-mul16-ld64.c.in b/src/qs8-vadd/sse-mul16-ld64.c.in
new file mode 100644
index 0000000..453a6b2
--- /dev/null
+++ b/src/qs8-vadd/sse-mul16-ld64.c.in
@@ -0,0 +1,201 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$SSE_HEADER = {2: "emmintrin.h", 4: "smmintrin.h"}[SSE]
+$assert BATCH_TILE % 8 == 0
+$assert BATCH_TILE >= 8
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <${SSE_HEADER}>
+
+#include <xnnpack/vadd.h>
+
+
+$ISA = {2: "sse2", 4: "sse41"}[SSE]
+void xnn_qs8_vadd_minmax_ukernel__${ISA}_mul16_ld64_x${BATCH_TILE}(
+ size_t n,
+ const int8_t* input_x,
+ const int8_t* input_y,
+ int8_t* output,
+ const union xnn_qs8_add_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  const __m128i vzero_point_product = _mm_load_si128((const __m128i*) &params->sse2.zero_point_product);
+  const __m128i vx_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_lo);
+  const __m128i vx_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.x_multiplier_hi);
+  const __m128i vy_multiplier_lo = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_lo);
+  const __m128i vy_multiplier_hi = _mm_load_si128((const __m128i*) &params->sse2.y_multiplier_hi);
+ const __m128i vremainder_mask = _mm_load_si128((const __m128i*) params->sse2.remainder_mask);
+ const __m128i vremainder_threshold = _mm_load_si128((const __m128i*) params->sse2.remainder_threshold);
+ const __m128i vshift = _mm_cvtsi32_si128((int) params->sse2.shift);
+ const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse2.output_zero_point);
+ const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse2.output_min);
+ const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse2.output_max);
+
+ for (; n >= ${BATCH_TILE} * sizeof(int8_t); n -= ${BATCH_TILE} * sizeof(int8_t)) {
+ $if SSE >= 4:
+ const __m128i vx${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x));
+ const __m128i vy${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_y));
+ $for N in range(8, BATCH_TILE, 8):
+ const __m128i vx${ABC[N:N+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_x + ${N})));
+ const __m128i vy${ABC[N:N+8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_y + ${N})));
+ $else:
+ __m128i vx${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_x);
+ __m128i vy${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_y);
+ $for N in range(8, BATCH_TILE, 8):
+ __m128i vx${ABC[N:N+8]} = _mm_loadl_epi64((const __m128i*) (input_x + ${N}));
+ __m128i vy${ABC[N:N+8]} = _mm_loadl_epi64((const __m128i*) (input_y + ${N}));
+ input_x += ${BATCH_TILE};
+ input_y += ${BATCH_TILE};
+
+ $if SSE < 4:
+ $for N in range(0, BATCH_TILE, 8):
+ vx${ABC[N:N+8]} = _mm_unpacklo_epi8(vx${ABC[N:N+8]}, _mm_cmpgt_epi8(_mm_setzero_si128(), vx${ABC[N:N+8]}));
+ vy${ABC[N:N+8]} = _mm_unpacklo_epi8(vy${ABC[N:N+8]}, _mm_cmpgt_epi8(_mm_setzero_si128(), vy${ABC[N:N+8]}));
+
+ $for N in range(0, BATCH_TILE, 8):
+ __m128i vxprod${ABC[N:N+8]}hi = _mm_mulhi_epu16(vx${ABC[N:N+8]}, vx_multiplier_lo);
+ __m128i vyprod${ABC[N:N+8]}hi = _mm_mulhi_epu16(vy${ABC[N:N+8]}, vy_multiplier_lo);
+ const __m128i vxprod${ABC[N:N+8]}lo = _mm_mullo_epi16(vx${ABC[N:N+8]}, vx_multiplier_lo);
+ const __m128i vyprod${ABC[N:N+8]}lo = _mm_mullo_epi16(vy${ABC[N:N+8]}, vy_multiplier_lo);
+
+ $for N in range(0, BATCH_TILE, 8):
+ vxprod${ABC[N:N+8]}hi = _mm_add_epi16(vxprod${ABC[N:N+8]}hi, _mm_mullo_epi16(vx${ABC[N:N+8]}, vx_multiplier_hi));
+ vyprod${ABC[N:N+8]}hi = _mm_add_epi16(vyprod${ABC[N:N+8]}hi, _mm_mullo_epi16(vy${ABC[N:N+8]}, vy_multiplier_hi));
+
+ $for N in range(0, BATCH_TILE, 8):
+ vxprod${ABC[N:N+8]}hi = _mm_sub_epi16(vxprod${ABC[N:N+8]}hi, _mm_and_si128(_mm_srai_epi16(vx${ABC[N:N+8]}, 15), vx_multiplier_lo));
+ vyprod${ABC[N:N+8]}hi = _mm_sub_epi16(vyprod${ABC[N:N+8]}hi, _mm_and_si128(_mm_srai_epi16(vy${ABC[N:N+8]}, 15), vy_multiplier_lo));
+
+ $for N in range(0, BATCH_TILE, 8):
+ __m128i vacc${ABC[N:N+4]} = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod${ABC[N:N+8]}lo, vxprod${ABC[N:N+8]}hi));
+ __m128i vacc${ABC[N+4:N+8]} = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod${ABC[N:N+8]}lo, vxprod${ABC[N:N+8]}hi));
+
+ $for N in range(0, BATCH_TILE, 8):
+ vacc${ABC[N:N+4]} = _mm_add_epi32(vacc${ABC[N:N+4]}, _mm_unpacklo_epi16(vyprod${ABC[N:N+8]}lo, vyprod${ABC[N:N+8]}hi));
+ vacc${ABC[N+4:N+8]} = _mm_add_epi32(vacc${ABC[N+4:N+8]}, _mm_unpackhi_epi16(vyprod${ABC[N:N+8]}lo, vyprod${ABC[N:N+8]}hi));
+
+ $for N in range(0, BATCH_TILE, 4):
+ const __m128i vrem${ABC[N:N+4]} = _mm_add_epi32(_mm_and_si128(vacc${ABC[N:N+4]}, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc${ABC[N:N+4]}));
+
+ $for N in range(0, BATCH_TILE, 4):
+ vacc${ABC[N:N+4]} = _mm_sub_epi32(_mm_sra_epi32(vacc${ABC[N:N+4]}, vshift), _mm_cmpgt_epi32(vrem${ABC[N:N+4]}, vremainder_threshold));
+
+ $for N in range(0, BATCH_TILE, 8):
+ __m128i vout${ABC[N:N+8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[N:N+4]}, vacc${ABC[N+4:N+8]}), voutput_zero_point);
+
+ $for N in range(0, BATCH_TILE, 8):
+ vout${ABC[N:N+8]} = _mm_max_epi16(vout${ABC[N:N+8]}, voutput_min);
+
+ $for N in range(0, BATCH_TILE, 8):
+ vout${ABC[N:N+8]} = _mm_min_epi16(vout${ABC[N:N+8]}, voutput_max);
+
+ $for N in range(0, BATCH_TILE, 16):
+ $if N + 8 < BATCH_TILE:
+ const __m128i vout${ABC[N:N+16]} = _mm_packs_epi16(vout${ABC[N:N+8]}, vout${ABC[N+8:N+16]});
+ $else:
+ const __m128i vout${ABC[N:N+8]}${ABC[N:N+8]} = _mm_packs_epi16(vout${ABC[N:N+8]}, vout${ABC[N:N+8]});
+
+ $if BATCH_TILE >= 16:
+ _mm_storeu_si128((__m128i*) output, vout${ABC[0:16]});
+ $else:
+ _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]});
+ $for N in range(16, BATCH_TILE, 16):
+ $if N + 8 < BATCH_TILE:
+ _mm_storeu_si128((__m128i*) (output + ${N}), vout${ABC[N:N+16]});
+ $else:
+ _mm_storel_epi64((__m128i*) (output + ${N}), vout${ABC[N:N+8]}${ABC[N:N+8]});
+ output += ${BATCH_TILE};
+ }
+ if XNN_UNLIKELY(n != 0) {
+ ${"do " if BATCH_TILE > 8 else ""}{
+ $if SSE >= 4:
+ const __m128i vx${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_x));
+ const __m128i vy${ABC[0:8]} = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_y));
+ $else:
+ __m128i vx${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_x);
+ __m128i vy${ABC[0:8]} = _mm_loadl_epi64((const __m128i*) input_y);
+ $if BATCH_TILE > 8:
+ input_x += 8;
+ input_y += 8;
+
+ $if SSE < 4:
+ vx${ABC[0:8]} = _mm_unpacklo_epi8(vx${ABC[0:8]}, _mm_cmpgt_epi8(_mm_setzero_si128(), vx${ABC[0:8]}));
+ vy${ABC[0:8]} = _mm_unpacklo_epi8(vy${ABC[0:8]}, _mm_cmpgt_epi8(_mm_setzero_si128(), vy${ABC[0:8]}));
+
+ __m128i vxprod${ABC[0:8]}hi = _mm_mulhi_epu16(vx${ABC[0:8]}, vx_multiplier_lo);
+ __m128i vyprod${ABC[0:8]}hi = _mm_mulhi_epu16(vy${ABC[0:8]}, vy_multiplier_lo);
+ const __m128i vxprod${ABC[0:8]}lo = _mm_mullo_epi16(vx${ABC[0:8]}, vx_multiplier_lo);
+ const __m128i vyprod${ABC[0:8]}lo = _mm_mullo_epi16(vy${ABC[0:8]}, vy_multiplier_lo);
+
+ vxprod${ABC[0:8]}hi = _mm_add_epi16(vxprod${ABC[0:8]}hi, _mm_mullo_epi16(vx${ABC[0:8]}, vx_multiplier_hi));
+ vyprod${ABC[0:8]}hi = _mm_add_epi16(vyprod${ABC[0:8]}hi, _mm_mullo_epi16(vy${ABC[0:8]}, vy_multiplier_hi));
+
+ vxprod${ABC[0:8]}hi = _mm_sub_epi16(vxprod${ABC[0:8]}hi, _mm_and_si128(_mm_srai_epi16(vx${ABC[0:8]}, 15), vx_multiplier_lo));
+ vyprod${ABC[0:8]}hi = _mm_sub_epi16(vyprod${ABC[0:8]}hi, _mm_and_si128(_mm_srai_epi16(vy${ABC[0:8]}, 15), vy_multiplier_lo));
+
+ __m128i vacc${ABC[0:4]} = _mm_add_epi32(vzero_point_product, _mm_unpacklo_epi16(vxprod${ABC[0:8]}lo, vxprod${ABC[0:8]}hi));
+ __m128i vacc${ABC[4:8]} = _mm_add_epi32(vzero_point_product, _mm_unpackhi_epi16(vxprod${ABC[0:8]}lo, vxprod${ABC[0:8]}hi));
+
+ vacc${ABC[0:4]} = _mm_add_epi32(vacc${ABC[0:4]}, _mm_unpacklo_epi16(vyprod${ABC[0:8]}lo, vyprod${ABC[0:8]}hi));
+ vacc${ABC[4:8]} = _mm_add_epi32(vacc${ABC[4:8]}, _mm_unpackhi_epi16(vyprod${ABC[0:8]}lo, vyprod${ABC[0:8]}hi));
+
+ const __m128i vrem${ABC[0:4]} = _mm_add_epi32(_mm_and_si128(vacc${ABC[0:4]}, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc${ABC[0:4]}));
+ const __m128i vrem${ABC[4:8]} = _mm_add_epi32(_mm_and_si128(vacc${ABC[4:8]}, vremainder_mask), _mm_cmpgt_epi32(_mm_setzero_si128(), vacc${ABC[4:8]}));
+
+ vacc${ABC[0:4]} = _mm_sub_epi32(_mm_sra_epi32(vacc${ABC[0:4]}, vshift), _mm_cmpgt_epi32(vrem${ABC[0:4]}, vremainder_threshold));
+ vacc${ABC[4:8]} = _mm_sub_epi32(_mm_sra_epi32(vacc${ABC[4:8]}, vshift), _mm_cmpgt_epi32(vrem${ABC[4:8]}, vremainder_threshold));
+
+ __m128i vout${ABC[0:8]} = _mm_adds_epi16(_mm_packs_epi32(vacc${ABC[0:4]}, vacc${ABC[4:8]}), voutput_zero_point);
+ vout${ABC[0:8]} = _mm_max_epi16(vout${ABC[0:8]}, voutput_min);
+ vout${ABC[0:8]} = _mm_min_epi16(vout${ABC[0:8]}, voutput_max);
+
+ __m128i vout${ABC[0:8]}${ABC[0:8]} = _mm_packs_epi16(vout${ABC[0:8]}, vout${ABC[0:8]});
+
+ $if BATCH_TILE > 8:
+ if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
+ _mm_storel_epi64((__m128i*) output, vout${ABC[0:8]}${ABC[0:8]});
+ output += 8;
+ n -= 8;
+ } else {
+ if (n & (4 * sizeof(int8_t))) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+ vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
+ output += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0);
+ vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16);
+ output += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ $if SSE >= 4:
+ *output = (int8_t) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
+ $else:
+ *output = (int32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+ output += 1;
+ }
+ n = 0;
+ }
+ $else:
+ if (n & (4 * sizeof(int8_t))) {
+ *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+ vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi64(vout${ABC[0:8]}${ABC[0:8]}, 32);
+ output += 4;
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout${ABC[0:8]}${ABC[0:8]}, 0);
+ vout${ABC[0:8]}${ABC[0:8]} = _mm_srli_epi32(vout${ABC[0:8]}${ABC[0:8]}, 16);
+ output += 2;
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ $if SSE >= 4:
+ *output = (int8_t) _mm_extract_epi8(vout${ABC[0:8]}${ABC[0:8]}, 0);
+ $else:
+ *output = (int32_t) _mm_cvtsi128_si32(vout${ABC[0:8]}${ABC[0:8]});
+ }
+ }${" while (n != 0);" if BATCH_TILE > 8 else ""}
+ }
+}
diff --git a/src/xnnpack/params-init.h b/src/xnnpack/params-init.h
index a0cd221..793284f 100644
--- a/src/xnnpack/params-init.h
+++ b/src/xnnpack/params-init.h
@@ -1137,7 +1137,7 @@
assert(b_output_scale < 0x1.0p+8f);
// Compute requantization parameters.
- const float max_output_scale = a_output_scale > b_output_scale ? a_output_scale : b_output_scale;
+ const float max_output_scale = math_max_f32(a_output_scale, b_output_scale);
assert(max_output_scale >= 0x1.0p-14f);
assert(max_output_scale < 0x1.0p+8f);
const uint32_t max_scale_bits = fp32_to_bits(max_output_scale);
@@ -1226,7 +1226,7 @@
assert(b_output_scale < 0x1.0p+8f);
// Compute requantization parameters.
- const float max_output_scale = a_output_scale > b_output_scale ? a_output_scale : b_output_scale;
+ const float max_output_scale = math_max_f32(a_output_scale, b_output_scale);
assert(max_output_scale >= 0x1.0p-10f);
assert(max_output_scale < 0x1.0p+8f);
const uint32_t max_scale_bits = fp32_to_bits(max_output_scale);
@@ -1259,6 +1259,143 @@
return params;
}
+static inline union xnn_qs8_add_params xnn_init_qs8_add_params(
+ int8_t x_zero_point,
+ int8_t y_zero_point,
+ int8_t output_zero_point,
+ float x_output_scale,
+ float y_output_scale,
+ int8_t output_min,
+ int8_t output_max)
+{
+ assert(x_output_scale >= 0x1.0p-14f);
+ assert(y_output_scale >= 0x1.0p-14f);
+ assert(x_output_scale < 0x1.0p+8f);
+ assert(y_output_scale < 0x1.0p+8f);
+
+ // Compute requantization parameters.
+ const float max_output_scale = math_max_f32(x_output_scale, y_output_scale);
+ assert(max_output_scale >= 0x1.0p-14f);
+ assert(max_output_scale < 0x1.0p+8f);
+ const uint32_t max_scale_bits = fp32_to_bits(max_output_scale);
+ const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
+ // Shift is in [13, 31] range.
+ const uint32_t shift = (uint32_t) (21 - max_scale_exponent);
+ assert(shift < 32);
+ assert(shift >= 13);
+
+ const float scale_multiplier = fp32_from_bits((uint32_t) (21 - max_scale_exponent + 127) << 23);
+
+ // Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range.
+ const int32_t x_multiplier = (int32_t) lrintf(x_output_scale * scale_multiplier);
+ const int32_t y_multiplier = (int32_t) lrintf(y_output_scale * scale_multiplier);
+ assert((x_multiplier > y_multiplier ? x_multiplier : y_multiplier) >= INT32_C(0x00200000));
+ assert(x_multiplier < INT32_C(0x00400000));
+ assert(y_multiplier < INT32_C(0x00400000));
+
+ union xnn_qs8_add_params params;
+ #if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ const int32_t remainder_mask = (INT32_C(1) << shift) - INT32_C(1);
+ const int32_t remainder_threshold = (int32_t) ((uint32_t) remainder_mask >> 1);
+ const int32_t zero_point_product =
+ (int32_t) -(x_multiplier * (int32_t) x_zero_point + y_multiplier * (int32_t) y_zero_point);
+ for (uint32_t i = 0; i < 4; i++) {
+ params.sse2.zero_point_product[i] = zero_point_product;
+ }
+ const uint16_t x_multiplier_lo = (uint16_t) x_multiplier;
+ const uint16_t x_multiplier_hi = (uint16_t) ((uint32_t) x_multiplier >> 16);
+ const uint16_t y_multiplier_lo = (uint16_t) y_multiplier;
+ const uint16_t y_multiplier_hi = (uint16_t) ((uint32_t) y_multiplier >> 16);
+ for (uint32_t i = 0; i < 8; i++) {
+ params.sse2.x_multiplier_lo[i] = x_multiplier_lo;
+ params.sse2.x_multiplier_hi[i] = x_multiplier_hi;
+ params.sse2.y_multiplier_lo[i] = y_multiplier_lo;
+ params.sse2.y_multiplier_hi[i] = y_multiplier_hi;
+ }
+ params.sse2.shift = shift;
+ for (uint32_t i = 0; i < 4; i++) {
+ params.sse2.remainder_mask[i] = remainder_mask;
+ params.sse2.remainder_threshold[i] = remainder_threshold;
+ }
+ for (uint32_t i = 0; i < 8; i++) {
+ params.sse2.output_zero_point[i] = (int16_t) output_zero_point;
+ params.sse2.output_min[i] = (int16_t) output_min;
+ params.sse2.output_max[i] = (int16_t) output_max;
+ }
+ #elif XNN_ARCH_ARM || XNN_ARCH_ARM64
+ params.neon.x_zero_point = x_zero_point;
+ params.neon.y_zero_point = y_zero_point;
+ params.neon.output_zero_point = (int16_t) output_zero_point;
+ params.neon.x_multiplier = (int32_t) x_multiplier;
+ params.neon.y_multiplier = (int32_t) y_multiplier;
+ params.neon.right_shift = (int32_t) -shift;
+ params.neon.output_min = output_min;
+ params.neon.output_max = output_max;
+ #else
+ const int32_t remainder_mask = (INT32_C(1) << shift) - INT32_C(1);
+ const int32_t remainder_threshold = (int32_t) ((uint32_t) remainder_mask >> 1);
+ params.scalar.zero_point_product =
+ (int32_t) -(x_multiplier * (int32_t) x_zero_point + y_multiplier * (int32_t) y_zero_point);
+ params.scalar.x_multiplier = x_multiplier;
+ params.scalar.y_multiplier = y_multiplier;
+ params.scalar.remainder_mask = (int32_t) remainder_mask;
+ params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+ params.scalar.shift = shift;
+ params.scalar.output_zero_point = (int32_t) output_zero_point;
+ params.scalar.output_min = (int32_t) output_min;
+ params.scalar.output_max = (int32_t) output_max;
+ #endif
+ return params;
+}
+
+static inline union xnn_qs8_add_params xnn_init_scalar_qs8_add_params(
+ int8_t x_zero_point,
+ int8_t y_zero_point,
+ int8_t output_zero_point,
+ float x_output_scale,
+ float y_output_scale,
+ int8_t output_min,
+ int8_t output_max)
+{
+ assert(x_output_scale >= 0x1.0p-10f);
+ assert(y_output_scale >= 0x1.0p-10f);
+ assert(x_output_scale < 0x1.0p+8f);
+ assert(y_output_scale < 0x1.0p+8f);
+
+ // Compute requantization parameters.
+ const float max_output_scale = math_max_f32(x_output_scale, y_output_scale);
+ assert(max_output_scale >= 0x1.0p-10f);
+ assert(max_output_scale < 0x1.0p+8f);
+ const uint32_t max_scale_bits = fp32_to_bits(max_output_scale);
+ const int32_t max_scale_exponent = (int32_t) (max_scale_bits >> 23) - 127;
+ // Shift is in [13, 31] range.
+ const uint32_t shift = (uint32_t) (21 - max_scale_exponent);
+ assert(shift < 32);
+ assert(shift >= 13);
+
+ // Multipliers are in [0, 2**22) range, largest multiplier is in [2**21, 2**22) range.
+ const int32_t x_multiplier = (int32_t) lrintf(fp32_from_bits(fp32_to_bits(x_output_scale) + (shift << 23)));
+ const int32_t y_multiplier = (int32_t) lrintf(fp32_from_bits(fp32_to_bits(y_output_scale) + (shift << 23)));
+ assert((x_multiplier > y_multiplier ? x_multiplier : y_multiplier) >= INT32_C(0x00200000));
+ assert(x_multiplier < INT32_C(0x00400000));
+ assert(y_multiplier < INT32_C(0x00400000));
+
+ union xnn_qs8_add_params params;
+ const int32_t remainder_mask = (INT32_C(1) << shift) - INT32_C(1);
+ const int32_t remainder_threshold = (int32_t) ((uint32_t) remainder_mask >> 1);
+ params.scalar.zero_point_product =
+ (int32_t) -(x_multiplier * (int32_t) x_zero_point + y_multiplier * (int32_t) y_zero_point);
+ params.scalar.x_multiplier = x_multiplier;
+ params.scalar.y_multiplier = y_multiplier;
+ params.scalar.remainder_mask = (int32_t) remainder_mask;
+ params.scalar.remainder_threshold = (int32_t) remainder_threshold;
+ params.scalar.shift = shift;
+ params.scalar.output_zero_point = (int32_t) output_zero_point;
+ params.scalar.output_min = (int32_t) output_min;
+ params.scalar.output_max = (int32_t) output_max;
+ return params;
+}
+
static inline union xnn_qu8_requantization_params xnn_init_scalar_qu8_requantization_params(
float scale,
uint8_t zero_point,
diff --git a/src/xnnpack/params.h b/src/xnnpack/params.h
index d496e83..88f9535 100644
--- a/src/xnnpack/params.h
+++ b/src/xnnpack/params.h
@@ -395,6 +395,47 @@
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
};
+union xnn_qs8_add_params {
+ struct {
+ int32_t zero_point_product;
+ int32_t x_multiplier;
+ int32_t y_multiplier;
+ uint32_t shift;
+ int32_t remainder_mask;
+ int32_t remainder_threshold;
+ int32_t output_zero_point;
+ int32_t output_min;
+ int32_t output_max;
+ } scalar;
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ struct {
+ uint8_t x_zero_point;
+ uint8_t y_zero_point;
+ int16_t output_zero_point;
+ int32_t x_multiplier;
+ int32_t y_multiplier;
+ int32_t right_shift;
+ uint8_t output_min;
+ uint8_t output_max;
+ } neon;
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ struct {
+ XNN_ALIGN(16) int32_t zero_point_product[4];
+ XNN_ALIGN(16) uint16_t x_multiplier_lo[8];
+ XNN_ALIGN(16) uint16_t x_multiplier_hi[8];
+ XNN_ALIGN(16) uint16_t y_multiplier_lo[8];
+ XNN_ALIGN(16) uint16_t y_multiplier_hi[8];
+ XNN_ALIGN(16) int32_t remainder_mask[4];
+ XNN_ALIGN(16) int32_t remainder_threshold[4];
+ uint32_t shift;
+ XNN_ALIGN(16) int16_t output_zero_point[8];
+ XNN_ALIGN(16) int16_t output_min[8];
+ XNN_ALIGN(16) int16_t output_max[8];
+ } sse2;
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+};
+
union xnn_qu8_avgpool_params {
struct {
int32_t bias;
@@ -1459,11 +1500,18 @@
typedef void (*xnn_qu8_vadd_minmax_ukernel_function)(
size_t n,
- const uint8_t* a,
- const uint8_t* b,
- uint8_t* y,
+ const uint8_t* input_x,
+ const uint8_t* input_y,
+ uint8_t* output,
const union xnn_qu8_add_params* params);
+typedef void (*xnn_qs8_vadd_minmax_ukernel_function)(
+ size_t n,
+ const int8_t* input_x,
+ const int8_t* input_y,
+ int8_t* output,
+ const union xnn_qs8_add_params* params);
+
typedef void (*xnn_f32_vsqrt_ukernel_function)(
size_t n,
const float* x,
diff --git a/src/xnnpack/requantization.h b/src/xnnpack/requantization.h
index 1137df7..262d191 100644
--- a/src/xnnpack/requantization.h
+++ b/src/xnnpack/requantization.h
@@ -218,3 +218,27 @@
}
return (uint8_t) y;
}
+
+static inline int8_t xnn_qs8_quantize_add(
+ int8_t x, int8_t y,
+ union xnn_qs8_add_params params)
+{
+ // Multiply by factors and accumulate products.
+ int32_t acc = params.scalar.zero_point_product +
+ (int32_t) ((int32_t) x * params.scalar.x_multiplier) +
+ (int32_t) ((int32_t) y * params.scalar.y_multiplier);
+
+ // Shift right and round.
+ const int32_t rem = (acc & params.scalar.remainder_mask) - (int32_t) (acc < 0);
+ acc = asr_s32(acc, params.scalar.shift) + (int32_t) (rem > params.scalar.remainder_threshold);
+
+ // Clamp and add output zero point.
+ int32_t out = acc + params.scalar.output_zero_point;
+ if (out >= params.scalar.output_max) {
+ out = params.scalar.output_max;
+ }
+ if (out <= params.scalar.output_min) {
+ out = params.scalar.output_min;
+ }
+ return (int8_t) out;
+}
diff --git a/src/xnnpack/vadd.h b/src/xnnpack/vadd.h
index accc677..94e8a31 100644
--- a/src/xnnpack/vadd.h
+++ b/src/xnnpack/vadd.h
@@ -32,6 +32,25 @@
DECLARE_QU8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qu8_vadd_minmax_ukernel__sse2)
+#define DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(fn_name) \
+ XNN_INTERNAL void fn_name( \
+ size_t n, \
+ const int8_t* input_x, \
+ const int8_t* input_y, \
+ int8_t* output, \
+ const union xnn_qs8_add_params* params);
+
+DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8)
+DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16)
+DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24)
+DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32)
+
+DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8)
+DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16)
+DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24)
+DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32)
+
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/test/qs8-vadd-minmax.cc b/test/qs8-vadd-minmax.cc
new file mode 100644
index 0000000..8db8f84
--- /dev/null
+++ b/test/qs8-vadd-minmax.cc
@@ -0,0 +1,721 @@
+// Copyright 2019 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Auto-generated file. Do not edit!
+// Specification: test/qs8-vadd-minmax.yaml
+// Generator: tools/generate-vbinary-test.py
+
+
+#include <gtest/gtest.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/isa-checks.h>
+
+#include <xnnpack/vadd.h>
+#include "vadd-microkernel-tester.h"
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X8, batch_eq_8) {
+ TEST_REQUIRES_X86_SSE2;
+ VAddMicrokernelTester()
+ .batch_size(8)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8);
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X8, batch_div_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X8, batch_lt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X8, batch_gt_8) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X8, inplace_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X8, inplace_b) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X8, inplace_a_and_b) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X8, qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X8, qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmax(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X16, batch_eq_16) {
+ TEST_REQUIRES_X86_SSE2;
+ VAddMicrokernelTester()
+ .batch_size(16)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16);
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X16, batch_div_16) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X16, batch_lt_16) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X16, batch_gt_16) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X16, inplace_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X16, inplace_b) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X16, inplace_a_and_b) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X16, qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X16, qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmax(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X24, batch_eq_24) {
+ TEST_REQUIRES_X86_SSE2;
+ VAddMicrokernelTester()
+ .batch_size(24)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24);
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X24, batch_div_24) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X24, batch_lt_24) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size < 24; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X24, batch_gt_24) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 25; batch_size < 48; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X24, inplace_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X24, inplace_b) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X24, inplace_a_and_b) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X24, qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X24, qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmax(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X32, batch_eq_32) {
+ TEST_REQUIRES_X86_SSE2;
+ VAddMicrokernelTester()
+ .batch_size(32)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32);
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X32, batch_div_32) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X32, batch_lt_32) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X32, batch_gt_32) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X32, inplace_a) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X32, inplace_b) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X32, inplace_a_and_b) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X32, qmin) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X32, qmax) {
+ TEST_REQUIRES_X86_SSE2;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmax(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X8, batch_eq_8) {
+ TEST_REQUIRES_X86_SSE41;
+ VAddMicrokernelTester()
+ .batch_size(8)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8);
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X8, batch_div_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 16; batch_size < 80; batch_size += 8) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X8, batch_lt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X8, batch_gt_8) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X8, inplace_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X8, inplace_b) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X8, inplace_a_and_b) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X8, qmin) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X8, qmax) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 40; batch_size += 7) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmax(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X16, batch_eq_16) {
+ TEST_REQUIRES_X86_SSE41;
+ VAddMicrokernelTester()
+ .batch_size(16)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16);
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X16, batch_div_16) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X16, batch_lt_16) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X16, batch_gt_16) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X16, inplace_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X16, inplace_b) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X16, inplace_a_and_b) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X16, qmin) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X16, qmax) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmax(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X24, batch_eq_24) {
+ TEST_REQUIRES_X86_SSE41;
+ VAddMicrokernelTester()
+ .batch_size(24)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24);
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X24, batch_div_24) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 48; batch_size < 240; batch_size += 24) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X24, batch_lt_24) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size < 24; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X24, batch_gt_24) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 25; batch_size < 48; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X24, inplace_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X24, inplace_b) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X24, inplace_a_and_b) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X24, qmin) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X24, qmax) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 120; batch_size += 23) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmax(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
+
+
+#if XNN_ARCH_X86 || XNN_ARCH_X86_64
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X32, batch_eq_32) {
+ TEST_REQUIRES_X86_SSE41;
+ VAddMicrokernelTester()
+ .batch_size(32)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32);
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X32, batch_div_32) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X32, batch_lt_32) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X32, batch_gt_32) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X32, inplace_a) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X32, inplace_b) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X32, inplace_a_and_b) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X32, qmin) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__SSE41_MUL16_LD64_X32, qmax) {
+ TEST_REQUIRES_X86_SSE41;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmax(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32);
+ }
+ }
+#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/test/qs8-vadd-minmax.yaml b/test/qs8-vadd-minmax.yaml
new file mode 100644
index 0000000..01ff288
--- /dev/null
+++ b/test/qs8-vadd-minmax.yaml
@@ -0,0 +1,12 @@
+# Copyright 2020 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+- name: xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8
+- name: xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16
+- name: xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24
+- name: xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x32
+- name: xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x8
+- name: xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x16
+- name: xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x24
+- name: xnn_qs8_vadd_minmax_ukernel__sse41_mul16_ld64_x32
diff --git a/test/qu8-vadd-minmax.cc b/test/qu8-vadd-minmax.cc
index d37df9a..2f126df 100644
--- a/test/qu8-vadd-minmax.cc
+++ b/test/qu8-vadd-minmax.cc
@@ -19,43 +19,43 @@
TEST(QU8_VADD_MINMAX__SSE2, n_eq_8) {
TEST_REQUIRES_X86_SSE2;
VAddMicrokernelTester()
- .n(8)
+ .batch_size(8)
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
}
TEST(QU8_VADD_MINMAX__SSE2, n_div_8) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 8; n < 128; n += 24) {
+ for (size_t batch_size = 8; batch_size < 128; batch_size += 24) {
VAddMicrokernelTester()
- .n(n)
+ .batch_size(batch_size)
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
}
}
TEST(QU8_VADD_MINMAX__SSE2, n_gt_8) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 9; n < 16; n++) {
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VAddMicrokernelTester()
- .n(n)
+ .batch_size(batch_size)
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
}
}
TEST(QU8_VADD_MINMAX__SSE2, n_lt_8) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n < 8; n++) {
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VAddMicrokernelTester()
- .n(n)
+ .batch_size(batch_size)
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
}
}
TEST(QU8_VADD_MINMAX__SSE2, inplace_a) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.inplace_a(true)
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
}
@@ -63,10 +63,10 @@
TEST(QU8_VADD_MINMAX__SSE2, inplace_b) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.inplace_b(true)
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
}
@@ -74,10 +74,10 @@
TEST(QU8_VADD_MINMAX__SSE2, inplace_a_and_b) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.inplace_a(true)
.inplace_b(true)
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
@@ -86,11 +86,11 @@
TEST(QU8_VADD_MINMAX__SSE2, a_scale) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
for (float a_scale = 1.0e-2; a_scale < 1.0e+2; a_scale *= 1.7f) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.a_scale(a_scale)
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
}
@@ -99,11 +99,11 @@
TEST(QU8_VADD_MINMAX__SSE2, b_scale) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
for (float b_scale = 1.0e-2; b_scale < 1.0e+2; b_scale *= 1.7f) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.b_scale(b_scale)
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
}
@@ -112,11 +112,11 @@
TEST(QU8_VADD_MINMAX__SSE2, y_scale) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
for (float y_scale = 1.0e-2; y_scale < 1.0e+2; y_scale *= 1.7f) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.y_scale(y_scale)
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
}
@@ -125,11 +125,11 @@
TEST(QU8_VADD_MINMAX__SSE2, a_zero_point) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
for (int32_t a_zero_point = 0; a_zero_point <= 255; a_zero_point += 51) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.a_zero_point(uint8_t(a_zero_point))
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
}
@@ -138,11 +138,11 @@
TEST(QU8_VADD_MINMAX__SSE2, b_zero_point) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
for (int32_t b_zero_point = 0; b_zero_point <= 255; b_zero_point += 51) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.b_zero_point(uint8_t(b_zero_point))
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
}
@@ -151,11 +151,11 @@
TEST(QU8_VADD_MINMAX__SSE2, y_zero_point) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
for (int32_t y_zero_point = 0; y_zero_point <= 255; y_zero_point += 51) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.y_zero_point(uint8_t(y_zero_point))
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
}
@@ -164,10 +164,10 @@
TEST(QU8_VADD_MINMAX__SSE2, qmin) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.qmin(128)
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
}
@@ -175,10 +175,10 @@
TEST(QU8_VADD_MINMAX__SSE2, qmax) {
TEST_REQUIRES_X86_SSE2;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.qmax(128)
.Test(xnn_qu8_vadd_minmax_ukernel__sse2);
}
@@ -189,43 +189,43 @@
TEST(QU8_VADD_MINMAX__NEON, n_eq_8) {
TEST_REQUIRES_ARM_NEON;
VAddMicrokernelTester()
- .n(8)
+ .batch_size(8)
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
}
TEST(QU8_VADD_MINMAX__NEON, n_div_8) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 8; n < 128; n += 24) {
+ for (size_t batch_size = 8; batch_size < 128; batch_size += 24) {
VAddMicrokernelTester()
- .n(n)
+ .batch_size(batch_size)
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
}
}
TEST(QU8_VADD_MINMAX__NEON, n_gt_8) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 9; n < 16; n++) {
+ for (size_t batch_size = 9; batch_size < 16; batch_size++) {
VAddMicrokernelTester()
- .n(n)
+ .batch_size(batch_size)
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
}
}
TEST(QU8_VADD_MINMAX__NEON, n_lt_8) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 1; n < 8; n++) {
+ for (size_t batch_size = 1; batch_size < 8; batch_size++) {
VAddMicrokernelTester()
- .n(n)
+ .batch_size(batch_size)
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
}
}
TEST(QU8_VADD_MINMAX__NEON, inplace_a) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.inplace_a(true)
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
}
@@ -233,10 +233,10 @@
TEST(QU8_VADD_MINMAX__NEON, inplace_b) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.inplace_b(true)
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
}
@@ -244,10 +244,10 @@
TEST(QU8_VADD_MINMAX__NEON, inplace_a_and_b) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.inplace_a(true)
.inplace_b(true)
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
@@ -256,11 +256,11 @@
TEST(QU8_VADD_MINMAX__NEON, a_scale) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
for (float a_scale = 1.0e-2; a_scale < 1.0e+2; a_scale *= 1.7f) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.a_scale(a_scale)
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
}
@@ -269,11 +269,11 @@
TEST(QU8_VADD_MINMAX__NEON, b_scale) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
for (float b_scale = 1.0e-2; b_scale < 1.0e+2; b_scale *= 1.7f) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.b_scale(b_scale)
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
}
@@ -282,11 +282,11 @@
TEST(QU8_VADD_MINMAX__NEON, y_scale) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
for (float y_scale = 1.0e-2; y_scale < 1.0e+2; y_scale *= 1.7f) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.y_scale(y_scale)
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
}
@@ -295,11 +295,11 @@
TEST(QU8_VADD_MINMAX__NEON, a_zero_point) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
for (int32_t a_zero_point = 0; a_zero_point <= 255; a_zero_point += 51) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.a_zero_point(uint8_t(a_zero_point))
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
}
@@ -308,11 +308,11 @@
TEST(QU8_VADD_MINMAX__NEON, b_zero_point) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
for (int32_t b_zero_point = 0; b_zero_point <= 255; b_zero_point += 51) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.b_zero_point(uint8_t(b_zero_point))
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
}
@@ -321,11 +321,11 @@
TEST(QU8_VADD_MINMAX__NEON, y_zero_point) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
for (int32_t y_zero_point = 0; y_zero_point <= 255; y_zero_point += 51) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.y_zero_point(uint8_t(y_zero_point))
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
}
@@ -334,10 +334,10 @@
TEST(QU8_VADD_MINMAX__NEON, qmin) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.qmin(128)
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
}
@@ -345,10 +345,10 @@
TEST(QU8_VADD_MINMAX__NEON, qmax) {
TEST_REQUIRES_ARM_NEON;
- for (size_t n = 1; n < 128; n += 11) {
+ for (size_t batch_size = 1; batch_size < 128; batch_size += 11) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.qmax(128)
.Test(xnn_qu8_vadd_minmax_ukernel__neon);
}
@@ -357,43 +357,43 @@
TEST(QU8_VADD_MINMAX__SCALAR, n_eq_1) {
VAddMicrokernelTester()
- .n(1)
+ .batch_size(1)
.Test(xnn_qu8_vadd_minmax_ukernel__scalar, VAddMicrokernelTester::Variant::Scalar);
}
TEST(QU8_VADD_MINMAX__SCALAR, n_gt_1) {
- for (size_t n = 2; n < 8; n++) {
+ for (size_t batch_size = 2; batch_size < 8; batch_size++) {
VAddMicrokernelTester()
- .n(n)
+ .batch_size(batch_size)
.Test(xnn_qu8_vadd_minmax_ukernel__scalar, VAddMicrokernelTester::Variant::Scalar);
}
}
TEST(QU8_VADD_MINMAX__SCALAR, inplace_a) {
- for (size_t n = 1; n < 16; n += 3) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size += 3) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.inplace_a(true)
.Test(xnn_qu8_vadd_minmax_ukernel__scalar, VAddMicrokernelTester::Variant::Scalar);
}
}
TEST(QU8_VADD_MINMAX__SCALAR, inplace_b) {
- for (size_t n = 1; n < 16; n += 3) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size += 3) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.inplace_b(true)
.Test(xnn_qu8_vadd_minmax_ukernel__scalar, VAddMicrokernelTester::Variant::Scalar);
}
}
TEST(QU8_VADD_MINMAX__SCALAR, inplace_a_and_b) {
- for (size_t n = 1; n < 16; n += 3) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size += 3) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.inplace_a(true)
.inplace_b(true)
.Test(xnn_qu8_vadd_minmax_ukernel__scalar, VAddMicrokernelTester::Variant::Scalar);
@@ -401,11 +401,11 @@
}
TEST(QU8_VADD_MINMAX__SCALAR, a_scale) {
- for (size_t n = 1; n < 16; n += 3) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size += 3) {
for (float a_scale = 1.0e-2; a_scale < 1.0e+2; a_scale *= 1.7f) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.a_scale(a_scale)
.Test(xnn_qu8_vadd_minmax_ukernel__scalar, VAddMicrokernelTester::Variant::Scalar);
}
@@ -413,11 +413,11 @@
}
TEST(QU8_VADD_MINMAX__SCALAR, b_scale) {
- for (size_t n = 1; n < 16; n += 3) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size += 3) {
for (float b_scale = 1.0e-2; b_scale < 1.0e+2; b_scale *= 1.7f) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.b_scale(b_scale)
.Test(xnn_qu8_vadd_minmax_ukernel__scalar, VAddMicrokernelTester::Variant::Scalar);
}
@@ -425,11 +425,11 @@
}
TEST(QU8_VADD_MINMAX__SCALAR, y_scale) {
- for (size_t n = 1; n < 16; n += 3) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size += 3) {
for (float y_scale = 1.0e-2; y_scale < 1.0e+2; y_scale *= 1.7f) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.y_scale(y_scale)
.Test(xnn_qu8_vadd_minmax_ukernel__scalar, VAddMicrokernelTester::Variant::Scalar);
}
@@ -437,11 +437,11 @@
}
TEST(QU8_VADD_MINMAX__SCALAR, a_zero_point) {
- for (size_t n = 1; n < 16; n += 3) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size += 3) {
for (int32_t a_zero_point = 0; a_zero_point <= 255; a_zero_point += 51) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.a_zero_point(uint8_t(a_zero_point))
.Test(xnn_qu8_vadd_minmax_ukernel__scalar, VAddMicrokernelTester::Variant::Scalar);
}
@@ -449,11 +449,11 @@
}
TEST(QU8_VADD_MINMAX__SCALAR, b_zero_point) {
- for (size_t n = 1; n < 16; n += 3) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size += 3) {
for (int32_t b_zero_point = 0; b_zero_point <= 255; b_zero_point += 51) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.b_zero_point(uint8_t(b_zero_point))
.Test(xnn_qu8_vadd_minmax_ukernel__scalar, VAddMicrokernelTester::Variant::Scalar);
}
@@ -461,11 +461,11 @@
}
TEST(QU8_VADD_MINMAX__SCALAR, y_zero_point) {
- for (size_t n = 1; n < 16; n += 3) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size += 3) {
for (int32_t y_zero_point = 0; y_zero_point <= 255; y_zero_point += 51) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.y_zero_point(uint8_t(y_zero_point))
.Test(xnn_qu8_vadd_minmax_ukernel__scalar, VAddMicrokernelTester::Variant::Scalar);
}
@@ -473,20 +473,20 @@
}
TEST(QU8_VADD_MINMAX__SCALAR, qmin) {
- for (size_t n = 1; n < 16; n += 3) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size += 3) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.qmin(128)
.Test(xnn_qu8_vadd_minmax_ukernel__scalar, VAddMicrokernelTester::Variant::Scalar);
}
}
TEST(QU8_VADD_MINMAX__SCALAR, qmax) {
- for (size_t n = 1; n < 16; n += 3) {
+ for (size_t batch_size = 1; batch_size < 16; batch_size += 3) {
VAddMicrokernelTester()
.iterations(1)
- .n(n)
+ .batch_size(batch_size)
.qmax(128)
.Test(xnn_qu8_vadd_minmax_ukernel__scalar, VAddMicrokernelTester::Variant::Scalar);
}
diff --git a/test/vadd-microkernel-tester.h b/test/vadd-microkernel-tester.h
index a98c448..55ed370 100644
--- a/test/vadd-microkernel-tester.h
+++ b/test/vadd-microkernel-tester.h
@@ -32,14 +32,14 @@
Scalar,
};
- inline VAddMicrokernelTester& n(size_t n) {
- assert(n != 0);
- this->n_ = n;
+ inline VAddMicrokernelTester& batch_size(size_t batch_size) {
+ assert(batch_size != 0);
+ this->batch_size_ = batch_size;
return *this;
}
- inline size_t n() const {
- return this->n_;
+ inline size_t batch_size() const {
+ return this->batch_size_;
}
inline VAddMicrokernelTester& inplace_a(bool inplace_a) {
@@ -152,11 +152,11 @@
auto rng = std::mt19937(random_device());
auto u8rng = std::bind(std::uniform_int_distribution<uint32_t>(0, std::numeric_limits<uint8_t>::max()), rng);
- std::vector<uint8_t> a(n() + XNN_EXTRA_BYTES / sizeof(uint8_t));
- std::vector<uint8_t> b(n() + XNN_EXTRA_BYTES / sizeof(uint8_t));
- std::vector<uint8_t> y(n() + (inplace_a() || inplace_b() ? XNN_EXTRA_BYTES / sizeof(uint8_t) : 0));
- std::vector<float> y_fp(n());
- std::vector<uint8_t> y_ref(n());
+ std::vector<uint8_t> a(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t));
+ std::vector<uint8_t> b(batch_size() + XNN_EXTRA_BYTES / sizeof(uint8_t));
+ std::vector<uint8_t> y(batch_size() + (inplace_a() || inplace_b() ? XNN_EXTRA_BYTES / sizeof(uint8_t) : 0));
+ std::vector<float> y_fp(batch_size());
+ std::vector<uint8_t> y_ref(batch_size());
for (size_t iteration = 0; iteration < iterations(); iteration++) {
std::generate(a.begin(), a.end(), std::ref(u8rng));
std::generate(b.begin(), b.end(), std::ref(u8rng));
@@ -191,7 +191,7 @@
qmin(), qmax());
// Compute reference results.
- for (size_t i = 0; i < n(); i++) {
+ for (size_t i = 0; i < batch_size(); i++) {
y_fp[i] = float(y_zero_point()) +
float(int32_t(a_data[i]) - int32_t(a_zero_point())) * (a_scale() / y_scale()) +
float(int32_t(b_data[i]) - int32_t(b_zero_point())) * (b_scale() / y_scale());
@@ -201,24 +201,95 @@
}
// Call optimized micro-kernel.
- vadd_minmax(n(), a_data, b_data, y.data(), &quantization_params);
+ vadd_minmax(batch_size(), a_data, b_data, y.data(), &quantization_params);
// Verify results.
- for (size_t i = 0; i < n(); i++) {
+ for (size_t i = 0; i < batch_size(); i++) {
ASSERT_LE(uint32_t(y[i]), uint32_t(qmax()))
- << "at " << i << ", n = " << n();
+ << "at element " << i << " / " << batch_size();
ASSERT_GE(uint32_t(y[i]), uint32_t(qmin()))
- << "at " << i << ", n = " << n();
+ << "at element " << i << " / " << batch_size();
ASSERT_NEAR(float(int32_t(y[i])), y_fp[i], 0.6f)
- << "at " << i << ", n = " << n();
+ << "at element " << i << " / " << batch_size();
ASSERT_EQ(uint32_t(y_ref[i]), uint32_t(y[i]))
- << "at " << i << ", n = " << n();
+ << "at element " << i << " / " << batch_size();
+ }
+ }
+ }
+
+ void Test(xnn_qs8_vadd_minmax_ukernel_function vadd_minmax, Variant variant = Variant::Native) const {
+ std::random_device random_device;
+ auto rng = std::mt19937(random_device());
+ auto i8rng = std::bind(
+ std::uniform_int_distribution<int32_t>(std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max()), rng);
+
+ std::vector<int8_t> a(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t));
+ std::vector<int8_t> b(batch_size() + XNN_EXTRA_BYTES / sizeof(int8_t));
+ std::vector<int8_t> y(batch_size() + (inplace_a() || inplace_b() ? XNN_EXTRA_BYTES / sizeof(int8_t) : 0));
+ std::vector<float> y_fp(batch_size());
+ std::vector<int8_t> y_ref(batch_size());
+ for (size_t iteration = 0; iteration < iterations(); iteration++) {
+ std::generate(a.begin(), a.end(), std::ref(i8rng));
+ std::generate(b.begin(), b.end(), std::ref(i8rng));
+ if (inplace_a() || inplace_b()) {
+ std::generate(y.begin(), y.end(), std::ref(i8rng));
+ } else {
+ std::fill(y.begin(), y.end(), 0xA5);
+ }
+ const int8_t* a_data = inplace_a() ? y.data() : a.data();
+ const int8_t* b_data = inplace_b() ? y.data() : b.data();
+
+ // Prepare parameters.
+ xnn_qs8_add_params quantization_params = { };
+ switch (variant) {
+ case Variant::Native:
+ quantization_params = xnn_init_qs8_add_params(
+ int8_t(a_zero_point() - 0x80), int8_t(b_zero_point() - 0x80), int8_t(y_zero_point() - 0x80),
+ a_scale() / y_scale(), b_scale() / y_scale(),
+ int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
+ break;
+ case Variant::Scalar:
+ quantization_params = xnn_init_scalar_qs8_add_params(
+ int8_t(a_zero_point() - 0x80), int8_t(b_zero_point() - 0x80), int8_t(y_zero_point() - 0x80),
+ a_scale() / y_scale(), b_scale() / y_scale(),
+ int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
+ break;
+ }
+ const xnn_qs8_add_params scalar_quantization_params =
+ xnn_init_scalar_qs8_add_params(
+ int8_t(a_zero_point() - 0x80), int8_t(b_zero_point() - 0x80), int8_t(y_zero_point() - 0x80),
+ a_scale() / y_scale(), b_scale() / y_scale(),
+ int8_t(qmin() - 0x80), int8_t(qmax() - 0x80));
+
+ // Compute reference results.
+ for (size_t i = 0; i < batch_size(); i++) {
+ y_fp[i] = float(int32_t(y_zero_point() - 0x80)) +
+ float(int32_t(a_data[i]) - int32_t(a_zero_point() - 0x80)) * (a_scale() / y_scale()) +
+ float(int32_t(b_data[i]) - int32_t(b_zero_point() - 0x80)) * (b_scale() / y_scale());
+ y_fp[i] = std::min<float>(y_fp[i], float(int32_t(qmax() - 0x80)));
+ y_fp[i] = std::max<float>(y_fp[i], float(int32_t(qmin() - 0x80)));
+ y_ref[i] = xnn_qs8_quantize_add(a_data[i], b_data[i], scalar_quantization_params);
+ }
+
+ // Call optimized micro-kernel.
+ vadd_minmax(batch_size(), a_data, b_data, y.data(), &quantization_params);
+
+ // Verify results.
+ for (size_t i = 0; i < batch_size(); i++) {
+ ASSERT_LE(int32_t(y[i]), int32_t(qmax() - 0x80))
+ << "at element " << i << " / " << batch_size();
+ ASSERT_GE(int32_t(y[i]), int32_t(qmin() - 0x80))
+ << "at element " << i << " / " << batch_size();
+ ASSERT_EQ(int32_t(y_ref[i]), int32_t(y[i]))
+ << "at element " << i << " / " << batch_size();
+ ASSERT_NEAR(float(int32_t(y[i])), y_fp[i], 0.6f)
+ << "at element " << i << " / " << batch_size();
}
}
}
private:
- size_t n_{1};
+ size_t batch_size_{1};
bool inplace_a_{false};
bool inplace_b_{false};
float a_scale_{0.75f};
diff --git a/tools/generate-vbinary-test.py b/tools/generate-vbinary-test.py
index f675855..bd77e44 100755
--- a/tools/generate-vbinary-test.py
+++ b/tools/generate-vbinary-test.py
@@ -19,6 +19,10 @@
parser = argparse.ArgumentParser(
description='Vector binary operation microkernel test generator')
+parser.add_argument("-t", "--tester", metavar="TESTER", required=True,
+ choices=["VAddMicrokernelTester",
+ "VBinOpMicrokernelTester", "VBinOpCMicrokernelTester"],
+ help="Tester class to be used in the generated test")
parser.add_argument("-s", "--spec", metavar="FILE", required=True,
help="Specification (YAML) file")
parser.add_argument("-o", "--output", metavar="FILE", required=True,
@@ -27,7 +31,7 @@
def split_ukernel_name(name):
- match = re.match(r"^xnn_(f16|f32)_v(add|div|max|min|mul|sqrdiff|sub|addc|divc|rdivc|maxc|minc|mulc|sqrdiffc|subc|rsubc)(_(minmax|relu))?_ukernel__(.+)_x(\d+)$", name)
+ match = re.match(r"^xnn_(qs8|f16|f32)_v(add|div|max|min|mul|sqrdiff|sub|addc|divc|rdivc|maxc|minc|mulc|sqrdiffc|subc|rsubc)(_(minmax|relu))?_ukernel__(.+)_x(\d+)$", name)
if match is None:
raise ValueError("Unexpected microkernel name: " + name)
op_type = {
@@ -171,13 +175,14 @@
"""
-def generate_test_cases(ukernel, op_type, activation_type, batch_tile, isa):
+def generate_test_cases(ukernel, op_type, activation_type, tester, batch_tile, isa):
"""Generates all tests cases for a Vector Binary Operation micro-kernel.
Args:
ukernel: C name of the micro-kernel function.
op_type: Operation type (ADD/MUL/SUB/etc).
activation_type: Activation type (LINEAR/MINMAX/RELU).
+ tester: C++ name of the tester class.
batch_tile: Number of batch elements processed per one iteration of the
inner loop of the micro-kernel.
isa: instruction set required to run the micro-kernel. Generated unit test
@@ -188,11 +193,9 @@
"""
_, test_name = ukernel.split("_", 1)
_, datatype, _ = ukernel.split("_", 2)
- tester = "VBinOp%sMicrokernelTester" % ("C" if op_type.endswith("C") else "")
- test_args = [
- ukernel,
- "%s::OpType::%s" % (tester, op_type),
- ]
+ test_args = [ukernel]
+ if tester in ["VBinOpMicrokernelTester", "VBinOpCMicrokernelTester"]:
+ test_args.append("%s::OpType::%s" % (tester, op_type))
if not isa or isa == "psimd":
test_args.append("%s::Variant::Scalar" % tester)
return xngen.preprocess(BINOP_TEST_TEMPLATE, {
@@ -216,11 +219,16 @@
raise ValueError("expected a list of micro-kernels in the spec")
spec_name = os.path.splitext(os.path.split(options.spec)[1])[0]
- opname = spec_name.split("-")[1]
- if opname.endswith("c"):
- header = "vbinaryc-microkernel-tester.h"
- else:
- header = "vbinary-microkernel-tester.h"
+ microkernel_header = {
+ "VAddMicrokernelTester": "xnnpack/vadd.h",
+ "VBinOpMicrokernelTester": "xnnpack/vbinary.h",
+ "VBinOpCMicrokernelTester": "xnnpack/vbinary.h",
+ }[options.tester]
+ tester_header = {
+ "VAddMicrokernelTester": "vadd-microkernel-tester.h",
+ "VBinOpMicrokernelTester": "vbinary-microkernel-tester.h",
+ "VBinOpCMicrokernelTester": "vbinaryc-microkernel-tester.h",
+ }[options.tester]
tests = """\
// Copyright 2019 Google LLC
//
@@ -237,9 +245,10 @@
#include <xnnpack/common.h>
#include <xnnpack/isa-checks.h>
-#include <xnnpack/vbinary.h>
-#include "{header}"
-""".format(specification=options.spec, generator=sys.argv[0], header=header)
+#include <{microkernel_header}>
+#include "{tester_header}"
+""".format(specification=options.spec, generator=sys.argv[0],
+ microkernel_header=microkernel_header, tester_header=tester_header)
for ukernel_spec in spec_yaml:
name = ukernel_spec["name"]
@@ -249,7 +258,7 @@
arch = ukernel_spec.get("arch", arch)
test_case = generate_test_cases(name, op_type, activation_type,
- batch_tile, isa)
+ options.tester, batch_tile, isa)
tests += "\n\n" + xnncommon.postprocess_test_case(test_case, arch, isa)
with codecs.open(options.output, "w", encoding="utf-8") as output_file: