LD128 versions of QS8/QU8 VADD[C] NEON microkernels
PiperOrigin-RevId: 387848961
diff --git a/BUILD.bazel b/BUILD.bazel
index a234be3..acb6048 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -2481,10 +2481,14 @@
"src/qs8-vadd/gen/minmax-neon-ld64-x16.c",
"src/qs8-vadd/gen/minmax-neon-ld64-x24.c",
"src/qs8-vadd/gen/minmax-neon-ld64-x32.c",
+ "src/qs8-vadd/gen/minmax-neon-ld128-x16.c",
+ "src/qs8-vadd/gen/minmax-neon-ld128-x32.c",
"src/qs8-vaddc/gen/minmax-neon-ld64-x8.c",
"src/qs8-vaddc/gen/minmax-neon-ld64-x16.c",
"src/qs8-vaddc/gen/minmax-neon-ld64-x24.c",
"src/qs8-vaddc/gen/minmax-neon-ld64-x32.c",
+ "src/qs8-vaddc/gen/minmax-neon-ld128-x16.c",
+ "src/qs8-vaddc/gen/minmax-neon-ld128-x32.c",
"src/qu8-avgpool/9p8x-minmax-neon-c8.c",
"src/qu8-avgpool/9x-minmax-neon-c8.c",
"src/qu8-dwconv/gen/up8x9-minmax-fp32-neon-mul16.c",
@@ -2518,8 +2522,10 @@
"src/qu8-requantization/rndna-neon.c",
"src/qu8-vadd/gen/minmax-neon-ld64-x8.c",
"src/qu8-vadd/gen/minmax-neon-ld64-x16.c",
+ "src/qu8-vadd/gen/minmax-neon-ld128-x16.c",
"src/qu8-vaddc/gen/minmax-neon-ld64-x8.c",
"src/qu8-vaddc/gen/minmax-neon-ld64-x16.c",
+ "src/qu8-vaddc/gen/minmax-neon-ld128-x16.c",
"src/u8-maxpool/9p8x-minmax-neon-c16.c",
"src/u8-rmax/neon.c",
"src/u8-vclamp/neon-x64.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0edf225..4583316 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1668,10 +1668,14 @@
src/qs8-vadd/gen/minmax-neon-ld64-x16.c
src/qs8-vadd/gen/minmax-neon-ld64-x24.c
src/qs8-vadd/gen/minmax-neon-ld64-x32.c
+ src/qs8-vadd/gen/minmax-neon-ld128-x16.c
+ src/qs8-vadd/gen/minmax-neon-ld128-x32.c
src/qs8-vaddc/gen/minmax-neon-ld64-x8.c
src/qs8-vaddc/gen/minmax-neon-ld64-x16.c
src/qs8-vaddc/gen/minmax-neon-ld64-x24.c
src/qs8-vaddc/gen/minmax-neon-ld64-x32.c
+ src/qs8-vaddc/gen/minmax-neon-ld128-x16.c
+ src/qs8-vaddc/gen/minmax-neon-ld128-x32.c
src/qu8-avgpool/9p8x-minmax-neon-c8.c
src/qu8-avgpool/9x-minmax-neon-c8.c
src/qu8-dwconv/gen/up8x9-minmax-fp32-neon-mul16.c
@@ -1705,8 +1709,10 @@
src/qu8-requantization/rndna-neon.c
src/qu8-vadd/gen/minmax-neon-ld64-x8.c
src/qu8-vadd/gen/minmax-neon-ld64-x16.c
+ src/qu8-vadd/gen/minmax-neon-ld128-x16.c
src/qu8-vaddc/gen/minmax-neon-ld64-x8.c
src/qu8-vaddc/gen/minmax-neon-ld64-x16.c
+ src/qu8-vaddc/gen/minmax-neon-ld128-x16.c
src/u8-maxpool/9p8x-minmax-neon-c16.c
src/u8-rmax/neon.c
src/u8-vclamp/neon-x64.c
diff --git a/bench/qs8-vadd.cc b/bench/qs8-vadd.cc
index 0823874..3f8fbcc 100644
--- a/bench/qs8-vadd.cc
+++ b/bench/qs8-vadd.cc
@@ -91,6 +91,19 @@
benchmark::utils::CheckNEON)
->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)
->UseRealTime();
+
+ BENCHMARK_CAPTURE(qs8_vadd, neon_ld128_x16,
+ xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16,
+ xnn_init_qs8_add_minmax_neon_params,
+ benchmark::utils::CheckNEON)
+ ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qs8_vadd, neon_ld128_x32,
+ xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32,
+ xnn_init_qs8_add_minmax_neon_params,
+ benchmark::utils::CheckNEON)
+ ->Apply(benchmark::utils::BinaryElementwiseParameters<int8_t, int8_t>)
+ ->UseRealTime();
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/bench/qs8-vaddc.cc b/bench/qs8-vaddc.cc
index 2d40bde..a0de142 100644
--- a/bench/qs8-vaddc.cc
+++ b/bench/qs8-vaddc.cc
@@ -90,6 +90,19 @@
benchmark::utils::CheckNEON)
->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
->UseRealTime();
+
+ BENCHMARK_CAPTURE(qs8_vaddc, neon_ld128_x16,
+ xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16,
+ xnn_init_qs8_add_minmax_neon_params,
+ benchmark::utils::CheckNEON)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
+ ->UseRealTime();
+ BENCHMARK_CAPTURE(qs8_vaddc, neon_ld128_x32,
+ xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32,
+ xnn_init_qs8_add_minmax_neon_params,
+ benchmark::utils::CheckNEON)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<int8_t, int8_t>)
+ ->UseRealTime();
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/bench/qu8-vadd.cc b/bench/qu8-vadd.cc
index a9378c3..8ab2618 100644
--- a/bench/qu8-vadd.cc
+++ b/bench/qu8-vadd.cc
@@ -79,6 +79,13 @@
benchmark::utils::CheckNEON)
->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
->UseRealTime();
+
+ BENCHMARK_CAPTURE(qu8_vadd, neon_ld128_x16,
+ xnn_qu8_vadd_minmax_ukernel__neon_ld128_x16,
+ xnn_init_qu8_add_minmax_neon_params,
+ benchmark::utils::CheckNEON)
+ ->Apply(benchmark::utils::BinaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/bench/qu8-vaddc.cc b/bench/qu8-vaddc.cc
index 119508b..c082d07 100644
--- a/bench/qu8-vaddc.cc
+++ b/bench/qu8-vaddc.cc
@@ -78,6 +78,13 @@
benchmark::utils::CheckNEON)
->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
->UseRealTime();
+
+ BENCHMARK_CAPTURE(qu8_vaddc, neon_ld128_x16,
+ xnn_qu8_vaddc_minmax_ukernel__neon_ld128_x16,
+ xnn_init_qu8_add_minmax_neon_params,
+ benchmark::utils::CheckNEON)
+ ->Apply(benchmark::utils::UnaryElementwiseParameters<uint8_t, uint8_t>)
+ ->UseRealTime();
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
diff --git a/scripts/generate-qs8-vadd.sh b/scripts/generate-qs8-vadd.sh
index 981e55e..6a34812 100755
--- a/scripts/generate-qs8-vadd.sh
+++ b/scripts/generate-qs8-vadd.sh
@@ -39,21 +39,31 @@
tools/xngen src/qs8-vaddc/wasmsimd.c.in -D BATCH_TILE=16 -D DATATYPE=QU8 -o src/qu8-vaddc/gen/minmax-wasmsimd-x16.c
################################### ARM NEON ##################################
-tools/xngen src/qs8-vadd/neon-ld64.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/qs8-vadd/gen/minmax-neon-ld64-x8.c
-tools/xngen src/qs8-vadd/neon-ld64.c.in -D BATCH_TILE=16 -D DATATYPE=QS8 -o src/qs8-vadd/gen/minmax-neon-ld64-x16.c
-tools/xngen src/qs8-vadd/neon-ld64.c.in -D BATCH_TILE=24 -D DATATYPE=QS8 -o src/qs8-vadd/gen/minmax-neon-ld64-x24.c
-tools/xngen src/qs8-vadd/neon-ld64.c.in -D BATCH_TILE=32 -D DATATYPE=QS8 -o src/qs8-vadd/gen/minmax-neon-ld64-x32.c
+tools/xngen src/qs8-vadd/neon.c.in -D BATCH_TILE=8 -D LD128=0 -D DATATYPE=QS8 -o src/qs8-vadd/gen/minmax-neon-ld64-x8.c
+tools/xngen src/qs8-vadd/neon.c.in -D BATCH_TILE=16 -D LD128=0 -D DATATYPE=QS8 -o src/qs8-vadd/gen/minmax-neon-ld64-x16.c
+tools/xngen src/qs8-vadd/neon.c.in -D BATCH_TILE=24 -D LD128=0 -D DATATYPE=QS8 -o src/qs8-vadd/gen/minmax-neon-ld64-x24.c
+tools/xngen src/qs8-vadd/neon.c.in -D BATCH_TILE=32 -D LD128=0 -D DATATYPE=QS8 -o src/qs8-vadd/gen/minmax-neon-ld64-x32.c
-tools/xngen src/qs8-vadd/neon-ld64.c.in -D BATCH_TILE=8 -D DATATYPE=QU8 -o src/qu8-vadd/gen/minmax-neon-ld64-x8.c
-tools/xngen src/qs8-vadd/neon-ld64.c.in -D BATCH_TILE=16 -D DATATYPE=QU8 -o src/qu8-vadd/gen/minmax-neon-ld64-x16.c
+tools/xngen src/qs8-vadd/neon.c.in -D BATCH_TILE=16 -D LD128=1 -D DATATYPE=QS8 -o src/qs8-vadd/gen/minmax-neon-ld128-x16.c
+tools/xngen src/qs8-vadd/neon.c.in -D BATCH_TILE=32 -D LD128=1 -D DATATYPE=QS8 -o src/qs8-vadd/gen/minmax-neon-ld128-x32.c
-tools/xngen src/qs8-vaddc/neon-ld64.c.in -D BATCH_TILE=8 -D DATATYPE=QS8 -o src/qs8-vaddc/gen/minmax-neon-ld64-x8.c
-tools/xngen src/qs8-vaddc/neon-ld64.c.in -D BATCH_TILE=16 -D DATATYPE=QS8 -o src/qs8-vaddc/gen/minmax-neon-ld64-x16.c
-tools/xngen src/qs8-vaddc/neon-ld64.c.in -D BATCH_TILE=24 -D DATATYPE=QS8 -o src/qs8-vaddc/gen/minmax-neon-ld64-x24.c
-tools/xngen src/qs8-vaddc/neon-ld64.c.in -D BATCH_TILE=32 -D DATATYPE=QS8 -o src/qs8-vaddc/gen/minmax-neon-ld64-x32.c
+tools/xngen src/qs8-vadd/neon.c.in -D BATCH_TILE=8 -D LD128=0 -D DATATYPE=QU8 -o src/qu8-vadd/gen/minmax-neon-ld64-x8.c
+tools/xngen src/qs8-vadd/neon.c.in -D BATCH_TILE=16 -D LD128=0 -D DATATYPE=QU8 -o src/qu8-vadd/gen/minmax-neon-ld64-x16.c
-tools/xngen src/qs8-vaddc/neon-ld64.c.in -D BATCH_TILE=8 -D DATATYPE=QU8 -o src/qu8-vaddc/gen/minmax-neon-ld64-x8.c
-tools/xngen src/qs8-vaddc/neon-ld64.c.in -D BATCH_TILE=16 -D DATATYPE=QU8 -o src/qu8-vaddc/gen/minmax-neon-ld64-x16.c
+tools/xngen src/qs8-vadd/neon.c.in -D BATCH_TILE=16 -D LD128=1 -D DATATYPE=QU8 -o src/qu8-vadd/gen/minmax-neon-ld128-x16.c
+
+tools/xngen src/qs8-vaddc/neon.c.in -D BATCH_TILE=8 -D LD128=0 -D DATATYPE=QS8 -o src/qs8-vaddc/gen/minmax-neon-ld64-x8.c
+tools/xngen src/qs8-vaddc/neon.c.in -D BATCH_TILE=16 -D LD128=0 -D DATATYPE=QS8 -o src/qs8-vaddc/gen/minmax-neon-ld64-x16.c
+tools/xngen src/qs8-vaddc/neon.c.in -D BATCH_TILE=24 -D LD128=0 -D DATATYPE=QS8 -o src/qs8-vaddc/gen/minmax-neon-ld64-x24.c
+tools/xngen src/qs8-vaddc/neon.c.in -D BATCH_TILE=32 -D LD128=0 -D DATATYPE=QS8 -o src/qs8-vaddc/gen/minmax-neon-ld64-x32.c
+
+tools/xngen src/qs8-vaddc/neon.c.in -D BATCH_TILE=16 -D LD128=1 -D DATATYPE=QS8 -o src/qs8-vaddc/gen/minmax-neon-ld128-x16.c
+tools/xngen src/qs8-vaddc/neon.c.in -D BATCH_TILE=32 -D LD128=1 -D DATATYPE=QS8 -o src/qs8-vaddc/gen/minmax-neon-ld128-x32.c
+
+tools/xngen src/qs8-vaddc/neon.c.in -D BATCH_TILE=8 -D LD128=0 -D DATATYPE=QU8 -o src/qu8-vaddc/gen/minmax-neon-ld64-x8.c
+tools/xngen src/qs8-vaddc/neon.c.in -D BATCH_TILE=16 -D LD128=0 -D DATATYPE=QU8 -o src/qu8-vaddc/gen/minmax-neon-ld64-x16.c
+
+tools/xngen src/qs8-vaddc/neon.c.in -D BATCH_TILE=16 -D LD128=1 -D DATATYPE=QU8 -o src/qu8-vaddc/gen/minmax-neon-ld128-x16.c
################################### x86 SSE ###################################
tools/xngen src/qs8-vadd/sse-mul16-ld64.c.in -D BATCH_TILE=8 -D SSE=2 -D AVX=0 -D XOP=0 -D DATATYPE=QS8 -o src/qs8-vadd/gen/minmax-sse2-mul16-ld64-x8.c
diff --git a/src/qs8-vadd/gen/minmax-neon-ld128-x16.c b/src/qs8-vadd/gen/minmax-neon-ld128-x16.c
new file mode 100644
index 0000000..6f1e411
--- /dev/null
+++ b/src/qs8-vadd/gen/minmax-neon-ld128-x16.c
@@ -0,0 +1,127 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vadd/neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16(
+ size_t n,
+ const int8_t* input_a,
+ const int8_t* input_b,
+ int8_t* output,
+ const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ #if XNN_ARCH_ARM64
+ const int8x16_t va_zero_point = vld1q_dup_s8(&params->neon.a_zero_point);
+ const int8x16_t vb_zero_point = vld1q_dup_s8(&params->neon.b_zero_point);
+ #else
+ const int8x8_t va_zero_point = vld1_dup_s8(&params->neon.a_zero_point);
+ const int8x8_t vb_zero_point = vld1_dup_s8(&params->neon.b_zero_point);
+ #endif
+ const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier);
+ const int32x4_t vb_multiplier = vld1q_dup_s32(&params->neon.b_multiplier);
+ const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+ const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+ for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
+ const int8x16_t va0123456789ABCDEF = vld1q_s8(input_a); input_a += 16;
+ const int8x16_t vb0123456789ABCDEF = vld1q_s8(input_b); input_b += 16;
+
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(vget_low_s8(va0123456789ABCDEF), vget_low_s8(va_zero_point));
+ const int16x8_t vxa89ABCDEF = vsubl_high_s8(va0123456789ABCDEF, va_zero_point);
+ const int16x8_t vxb01234567 = vsubl_s8(vget_low_s8(vb0123456789ABCDEF), vget_low_s8(vb_zero_point));
+ const int16x8_t vxb89ABCDEF = vsubl_high_s8(vb0123456789ABCDEF, vb_zero_point);
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(vget_low_s8(va0123456789ABCDEF), va_zero_point);
+ const int16x8_t vxa89ABCDEF = vsubl_s8(vget_high_s8(va0123456789ABCDEF), va_zero_point);
+ const int16x8_t vxb01234567 = vsubl_s8(vget_low_s8(vb0123456789ABCDEF), vb_zero_point);
+ const int16x8_t vxb89ABCDEF = vsubl_s8(vget_high_s8(vb0123456789ABCDEF), vb_zero_point);
+ #endif // XNN_ARCH_ARM64
+
+ int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc89AB = vmulq_s32(vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier);
+ int32x4_t vaccCDEF = vmulq_s32(vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier);
+
+ vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier);
+ vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier);
+ vacc89AB = vmlaq_s32(vacc89AB, vmovl_s16(vget_low_s16(vxb89ABCDEF)), vb_multiplier);
+ vaccCDEF = vmlaq_s32(vaccCDEF, vmovl_s16(vget_high_s16(vxb89ABCDEF)), vb_multiplier);
+
+ vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+ vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+ vacc89AB = vrshlq_s32(vacc89AB, vright_shift);
+ vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift);
+
+ const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+ const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point);
+
+ int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
+
+ vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min);
+
+ vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max);
+
+ vst1q_s8(output, vout0123456789ABCDEF); output += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ do {
+ const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8;
+ const int8x8_t vb01234567 = vld1_s8(input_b); input_b += 8;
+
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(va01234567, vget_low_s8(va_zero_point));
+ const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vget_low_s8(vb_zero_point));
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point);
+ const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vb_zero_point);
+ #endif // XNN_ARCH_ARM64
+
+ int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+
+ vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier);
+ vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier);
+
+ vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+ vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+
+ const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+
+ int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
+ vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
+ vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
+
+ if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
+ vst1_s8(output, vout01234567); output += 8;
+ n -= 8 * sizeof(int8_t);
+ } else {
+ if (n & (4 * sizeof(int8_t))) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_s8(vout01234567), 0); output += 4;
+ vout01234567 = vext_s8(vout01234567, vout01234567, 4);
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ vst1_lane_u16(__builtin_assume_aligned(output, 1), vreinterpret_u16_s8(vout01234567), 0); output += 2;
+ vout01234567 = vext_s8(vout01234567, vout01234567, 2);
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ vst1_lane_s8(output, vout01234567, 0);
+ }
+ n = 0;
+ }
+ } while (n != 0);
+ }
+}
diff --git a/src/qs8-vadd/gen/minmax-neon-ld128-x32.c b/src/qs8-vadd/gen/minmax-neon-ld128-x32.c
new file mode 100644
index 0000000..83a3798a
--- /dev/null
+++ b/src/qs8-vadd/gen/minmax-neon-ld128-x32.c
@@ -0,0 +1,155 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vadd/neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32(
+ size_t n,
+ const int8_t* input_a,
+ const int8_t* input_b,
+ int8_t* output,
+ const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ #if XNN_ARCH_ARM64
+ const int8x16_t va_zero_point = vld1q_dup_s8(&params->neon.a_zero_point);
+ const int8x16_t vb_zero_point = vld1q_dup_s8(&params->neon.b_zero_point);
+ #else
+ const int8x8_t va_zero_point = vld1_dup_s8(&params->neon.a_zero_point);
+ const int8x8_t vb_zero_point = vld1_dup_s8(&params->neon.b_zero_point);
+ #endif
+ const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier);
+ const int32x4_t vb_multiplier = vld1q_dup_s32(&params->neon.b_multiplier);
+ const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+ const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+ for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
+ const int8x16_t va0123456789ABCDEF = vld1q_s8(input_a); input_a += 16;
+ const int8x16_t vb0123456789ABCDEF = vld1q_s8(input_b); input_b += 16;
+ const int8x16_t vaGHIJKLMNOPQRSTUV = vld1q_s8(input_a); input_a += 16;
+ const int8x16_t vbGHIJKLMNOPQRSTUV = vld1q_s8(input_b); input_b += 16;
+
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(vget_low_s8(va0123456789ABCDEF), vget_low_s8(va_zero_point));
+ const int16x8_t vxa89ABCDEF = vsubl_high_s8(va0123456789ABCDEF, va_zero_point);
+ const int16x8_t vxb01234567 = vsubl_s8(vget_low_s8(vb0123456789ABCDEF), vget_low_s8(vb_zero_point));
+ const int16x8_t vxb89ABCDEF = vsubl_high_s8(vb0123456789ABCDEF, vb_zero_point);
+ const int16x8_t vxaGHIJKLMN = vsubl_s8(vget_low_s8(vaGHIJKLMNOPQRSTUV), vget_low_s8(va_zero_point));
+ const int16x8_t vxaOPQRSTUV = vsubl_high_s8(vaGHIJKLMNOPQRSTUV, va_zero_point);
+ const int16x8_t vxbGHIJKLMN = vsubl_s8(vget_low_s8(vbGHIJKLMNOPQRSTUV), vget_low_s8(vb_zero_point));
+ const int16x8_t vxbOPQRSTUV = vsubl_high_s8(vbGHIJKLMNOPQRSTUV, vb_zero_point);
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(vget_low_s8(va0123456789ABCDEF), va_zero_point);
+ const int16x8_t vxa89ABCDEF = vsubl_s8(vget_high_s8(va0123456789ABCDEF), va_zero_point);
+ const int16x8_t vxb01234567 = vsubl_s8(vget_low_s8(vb0123456789ABCDEF), vb_zero_point);
+ const int16x8_t vxb89ABCDEF = vsubl_s8(vget_high_s8(vb0123456789ABCDEF), vb_zero_point);
+ const int16x8_t vxaGHIJKLMN = vsubl_s8(vget_low_s8(vaGHIJKLMNOPQRSTUV), va_zero_point);
+ const int16x8_t vxaOPQRSTUV = vsubl_s8(vget_high_s8(vaGHIJKLMNOPQRSTUV), va_zero_point);
+ const int16x8_t vxbGHIJKLMN = vsubl_s8(vget_low_s8(vbGHIJKLMNOPQRSTUV), vb_zero_point);
+ const int16x8_t vxbOPQRSTUV = vsubl_s8(vget_high_s8(vbGHIJKLMNOPQRSTUV), vb_zero_point);
+ #endif // XNN_ARCH_ARM64
+
+ int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc89AB = vmulq_s32(vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier);
+ int32x4_t vaccCDEF = vmulq_s32(vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier);
+ int32x4_t vaccGHIJ = vmulq_s32(vmovl_s16(vget_low_s16(vxaGHIJKLMN)), va_multiplier);
+ int32x4_t vaccKLMN = vmulq_s32(vmovl_s16(vget_high_s16(vxaGHIJKLMN)), va_multiplier);
+ int32x4_t vaccOPQR = vmulq_s32(vmovl_s16(vget_low_s16(vxaOPQRSTUV)), va_multiplier);
+ int32x4_t vaccSTUV = vmulq_s32(vmovl_s16(vget_high_s16(vxaOPQRSTUV)), va_multiplier);
+
+ vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier);
+ vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier);
+ vacc89AB = vmlaq_s32(vacc89AB, vmovl_s16(vget_low_s16(vxb89ABCDEF)), vb_multiplier);
+ vaccCDEF = vmlaq_s32(vaccCDEF, vmovl_s16(vget_high_s16(vxb89ABCDEF)), vb_multiplier);
+ vaccGHIJ = vmlaq_s32(vaccGHIJ, vmovl_s16(vget_low_s16(vxbGHIJKLMN)), vb_multiplier);
+ vaccKLMN = vmlaq_s32(vaccKLMN, vmovl_s16(vget_high_s16(vxbGHIJKLMN)), vb_multiplier);
+ vaccOPQR = vmlaq_s32(vaccOPQR, vmovl_s16(vget_low_s16(vxbOPQRSTUV)), vb_multiplier);
+ vaccSTUV = vmlaq_s32(vaccSTUV, vmovl_s16(vget_high_s16(vxbOPQRSTUV)), vb_multiplier);
+
+ vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+ vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+ vacc89AB = vrshlq_s32(vacc89AB, vright_shift);
+ vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift);
+ vaccGHIJ = vrshlq_s32(vaccGHIJ, vright_shift);
+ vaccKLMN = vrshlq_s32(vaccKLMN, vright_shift);
+ vaccOPQR = vrshlq_s32(vaccOPQR, vright_shift);
+ vaccSTUV = vrshlq_s32(vaccSTUV, vright_shift);
+
+ const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+ const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point);
+ const int16x8_t vaccGHIJKLMN = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)), voutput_zero_point);
+ const int16x8_t vaccOPQRSTUV = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)), voutput_zero_point);
+
+ int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
+ int8x16_t voutGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV));
+
+ vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min);
+ voutGHIJKLMNOPQRSTUV = vmaxq_s8(voutGHIJKLMNOPQRSTUV, voutput_min);
+
+ vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max);
+ voutGHIJKLMNOPQRSTUV = vminq_s8(voutGHIJKLMNOPQRSTUV, voutput_max);
+
+ vst1q_s8(output, vout0123456789ABCDEF); output += 16;
+ vst1q_s8(output, voutGHIJKLMNOPQRSTUV); output += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ do {
+ const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8;
+ const int8x8_t vb01234567 = vld1_s8(input_b); input_b += 8;
+
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(va01234567, vget_low_s8(va_zero_point));
+ const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vget_low_s8(vb_zero_point));
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point);
+ const int16x8_t vxb01234567 = vsubl_s8(vb01234567, vb_zero_point);
+ #endif // XNN_ARCH_ARM64
+
+ int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+
+ vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier);
+ vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier);
+
+ vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+ vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+
+ const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+
+ int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
+ vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
+ vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
+
+ if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
+ vst1_s8(output, vout01234567); output += 8;
+ n -= 8 * sizeof(int8_t);
+ } else {
+ if (n & (4 * sizeof(int8_t))) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_s8(vout01234567), 0); output += 4;
+ vout01234567 = vext_s8(vout01234567, vout01234567, 4);
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ vst1_lane_u16(__builtin_assume_aligned(output, 1), vreinterpret_u16_s8(vout01234567), 0); output += 2;
+ vout01234567 = vext_s8(vout01234567, vout01234567, 2);
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ vst1_lane_s8(output, vout01234567, 0);
+ }
+ n = 0;
+ }
+ } while (n != 0);
+ }
+}
diff --git a/src/qs8-vadd/gen/minmax-neon-ld64-x16.c b/src/qs8-vadd/gen/minmax-neon-ld64-x16.c
index 601cc33..92ceba3 100644
--- a/src/qs8-vadd/gen/minmax-neon-ld64-x16.c
+++ b/src/qs8-vadd/gen/minmax-neon-ld64-x16.c
@@ -1,5 +1,5 @@
// Auto-generated file. Do not edit!
-// Template: src/qs8-vadd/neon-ld64.c.in
+// Template: src/qs8-vadd/neon.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
diff --git a/src/qs8-vadd/gen/minmax-neon-ld64-x24.c b/src/qs8-vadd/gen/minmax-neon-ld64-x24.c
index 63a8c72..2fea004 100644
--- a/src/qs8-vadd/gen/minmax-neon-ld64-x24.c
+++ b/src/qs8-vadd/gen/minmax-neon-ld64-x24.c
@@ -1,5 +1,5 @@
// Auto-generated file. Do not edit!
-// Template: src/qs8-vadd/neon-ld64.c.in
+// Template: src/qs8-vadd/neon.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
diff --git a/src/qs8-vadd/gen/minmax-neon-ld64-x32.c b/src/qs8-vadd/gen/minmax-neon-ld64-x32.c
index e460dd2..a2e1bd9 100644
--- a/src/qs8-vadd/gen/minmax-neon-ld64-x32.c
+++ b/src/qs8-vadd/gen/minmax-neon-ld64-x32.c
@@ -1,5 +1,5 @@
// Auto-generated file. Do not edit!
-// Template: src/qs8-vadd/neon-ld64.c.in
+// Template: src/qs8-vadd/neon.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
diff --git a/src/qs8-vadd/gen/minmax-neon-ld64-x8.c b/src/qs8-vadd/gen/minmax-neon-ld64-x8.c
index c2056fe..ca26da7 100644
--- a/src/qs8-vadd/gen/minmax-neon-ld64-x8.c
+++ b/src/qs8-vadd/gen/minmax-neon-ld64-x8.c
@@ -1,5 +1,5 @@
// Auto-generated file. Do not edit!
-// Template: src/qs8-vadd/neon-ld64.c.in
+// Template: src/qs8-vadd/neon.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
@@ -27,8 +27,8 @@
const int32x4_t vb_multiplier = vld1q_dup_s32(&params->neon.b_multiplier);
const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
- const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
- const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+ const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+ const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8;
@@ -50,9 +50,9 @@
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
- vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
+ vout01234567 = vmax_s8(vout01234567, voutput_min);
- vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
+ vout01234567 = vmin_s8(vout01234567, voutput_max);
vst1_s8(output, vout01234567); output += 8;
}
@@ -76,8 +76,8 @@
const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
- vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
- vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
+ vout01234567 = vmax_s8(vout01234567, voutput_min);
+ vout01234567 = vmin_s8(vout01234567, voutput_max);
if (n & (4 * sizeof(int8_t))) {
vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_s8(vout01234567), 0); output += 4;
diff --git a/src/qs8-vadd/neon-ld64.c.in b/src/qs8-vadd/neon-ld64.c.in
deleted file mode 100644
index 54bdc3e..0000000
--- a/src/qs8-vadd/neon-ld64.c.in
+++ /dev/null
@@ -1,166 +0,0 @@
-// Copyright 2020 Google LLC
-//
-// This source code is licensed under the BSD-style license found in the
-// LICENSE file in the root directory of this source tree.
-
-$assert DATATYPE in ["QS8", "QU8"]
-$assert BATCH_TILE % 8 == 0
-$assert BATCH_TILE >= 8
-$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-#include <assert.h>
-
-#include <arm_neon.h>
-
-#include <xnnpack/vadd.h>
-
-
-$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
-$XINT8X8_T = {"QS8": "int8x8_t", "QU8": "uint8x8_t"}[DATATYPE]
-$XINT8X16_T = {"QS8": "int8x16_t", "QU8": "uint8x16_t"}[DATATYPE]
-$VLD1_X8 = {"QS8": "vld1_s8", "QU8": "vld1_u8"}[DATATYPE]
-$VLD1_DUP_X8 = {"QS8": "vld1_dup_s8", "QU8": "vld1_dup_u8"}[DATATYPE]
-$VLD1Q_DUP_X8 = {"QS8": "vld1q_dup_s8", "QU8": "vld1q_dup_u8"}[DATATYPE]
-$VST1_LANE_X8 = {"QS8": "vst1_lane_s8", "QU8": "vst1_lane_u8"}[DATATYPE]
-$VST1_X8 = {"QS8": "vst1_s8", "QU8": "vst1_u8"}[DATATYPE]
-$VST1Q_X8 = {"QS8": "vst1q_s8", "QU8": "vst1q_u8"}[DATATYPE]
-$VMIN_X8 = {"QS8": "vmin_s8", "QU8": "vmin_u8"}[DATATYPE]
-$VMAX_X8 = {"QS8": "vmax_s8", "QU8": "vmax_u8"}[DATATYPE]
-$VMINQ_X8 = {"QS8": "vminq_s8", "QU8": "vminq_u8"}[DATATYPE]
-$VMAXQ_X8 = {"QS8": "vmaxq_s8", "QU8": "vmaxq_u8"}[DATATYPE]
-$VQMOVXN_S16 = {"QS8": "vqmovn_s16", "QU8": "vqmovun_s16"}[DATATYPE]
-$VEXT_X8 = {"QS8": "vext_s8", "QU8": "vext_u8"}[DATATYPE]
-$VGET_LOW_X8 = {"QS8": "vget_low_s8", "QU8": "vget_low_u8"}[DATATYPE]
-$VCOMBINE_X8 = {"QS8": "vcombine_s8", "QU8": "vcombine_u8"}[DATATYPE]
-$VREINTERPRET_U32_X8 = {"QS8": "vreinterpret_u32_s8", "QU8": "vreinterpret_u32_u8"}[DATATYPE]
-$VREINTERPRET_U16_X8 = {"QS8": "vreinterpret_u16_s8", "QU8": "vreinterpret_u16_u8"}[DATATYPE]
-void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__neon_ld64_x${BATCH_TILE}(
- size_t n,
- const ${XINT8_T}* input_a,
- const ${XINT8_T}* input_b,
- ${XINT8_T}* output,
- const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
-{
- const ${XINT8X8_T} va_zero_point = ${VLD1_DUP_X8}(&params->neon.a_zero_point);
- const ${XINT8X8_T} vb_zero_point = ${VLD1_DUP_X8}(&params->neon.b_zero_point);
- const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier);
- const int32x4_t vb_multiplier = vld1q_dup_s32(&params->neon.b_multiplier);
- const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
- const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
- const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(&params->neon.output_min);
- const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(&params->neon.output_max);
-
- for (; n >= ${BATCH_TILE} * sizeof(${XINT8_T}); n -= ${BATCH_TILE} * sizeof(${XINT8_T})) {
- $for N in range(0, BATCH_TILE, 8):
- const ${XINT8X8_T} va${ABC[N:N+8]} = ${VLD1_X8}(input_a); input_a += 8;
- const ${XINT8X8_T} vb${ABC[N:N+8]} = ${VLD1_X8}(input_b); input_b += 8;
-
- $for N in range(0, BATCH_TILE, 8):
- $if DATATYPE == "QU8":
- const int16x8_t vxa${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[N:N+8]}, va_zero_point));
- const int16x8_t vxb${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[N:N+8]}, vb_zero_point));
- $else:
- const int16x8_t vxa${ABC[N:N+8]} = vsubl_s8(va${ABC[N:N+8]}, va_zero_point);
- const int16x8_t vxb${ABC[N:N+8]} = vsubl_s8(vb${ABC[N:N+8]}, vb_zero_point);
-
- $for N in range(0, BATCH_TILE, 8):
- int32x4_t vacc${ABC[N:N+4]} = vmulq_s32(vmovl_s16(vget_low_s16(vxa${ABC[N:N+8]})), va_multiplier);
- int32x4_t vacc${ABC[N+4:N+8]} = vmulq_s32(vmovl_s16(vget_high_s16(vxa${ABC[N:N+8]})), va_multiplier);
-
- $for N in range(0, BATCH_TILE, 8):
- vacc${ABC[N:N+4]} = vmlaq_s32(vacc${ABC[N:N+4]}, vmovl_s16(vget_low_s16(vxb${ABC[N:N+8]})), vb_multiplier);
- vacc${ABC[N+4:N+8]} = vmlaq_s32(vacc${ABC[N+4:N+8]}, vmovl_s16(vget_high_s16(vxb${ABC[N:N+8]})), vb_multiplier);
-
- $for N in range(0, BATCH_TILE, 4):
- vacc${ABC[N:N+4]} = vrshlq_s32(vacc${ABC[N:N+4]}, vright_shift);
-
- $for N in range(0, BATCH_TILE, 8):
- const int16x8_t vacc${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${ABC[N:N+4]}), vqmovn_s32(vacc${ABC[N+4:N+8]})), voutput_zero_point);
-
- $for N in range(0, BATCH_TILE, 16):
- $if N + 8 < BATCH_TILE:
- ${XINT8X16_T} vout${ABC[N:N+16]} = ${VCOMBINE_X8}(${VQMOVXN_S16}(vacc${ABC[N:N+8]}), ${VQMOVXN_S16}(vacc${ABC[N+8:N+16]}));
- $else:
- ${XINT8X8_T} vout${ABC[N:N+8]} = ${VQMOVXN_S16}(vacc${ABC[N:N+8]});
-
- $for N in range(0, BATCH_TILE, 16):
- $if N + 8 < BATCH_TILE:
- vout${ABC[N:N+16]} = ${VMAXQ_X8}(vout${ABC[N:N+16]}, voutput_min);
- $else:
- vout${ABC[N:N+8]} = ${VMAX_X8}(vout${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_min));
-
- $for N in range(0, BATCH_TILE, 16):
- $if N + 8 < BATCH_TILE:
- vout${ABC[N:N+16]} = ${VMINQ_X8}(vout${ABC[N:N+16]}, voutput_max);
- $else:
- vout${ABC[N:N+8]} = ${VMIN_X8}(vout${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_max));
-
- $for N in range(0, BATCH_TILE, 16):
- $if N + 8 < BATCH_TILE:
- ${VST1Q_X8}(output, vout${ABC[N:N+16]}); output += 16;
- $else:
- ${VST1_X8}(output, vout${ABC[N:N+8]}); output += 8;
- }
- if XNN_UNLIKELY(n != 0) {
- ${"do " if BATCH_TILE > 8 else ""}{
- $if BATCH_TILE > 8:
- const ${XINT8X8_T} va${ABC[0:8]} = ${VLD1_X8}(input_a); input_a += 8;
- const ${XINT8X8_T} vb${ABC[0:8]} = ${VLD1_X8}(input_b); input_b += 8;
- $else:
- const ${XINT8X8_T} va${ABC[0:8]} = ${VLD1_X8}(input_a);
- const ${XINT8X8_T} vb${ABC[0:8]} = ${VLD1_X8}(input_b);
-
- $if DATATYPE == "QU8":
- const int16x8_t vxa${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[0:8]}, va_zero_point));
- const int16x8_t vxb${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[0:8]}, vb_zero_point));
- $else:
- const int16x8_t vxa${ABC[0:8]} = vsubl_s8(va${ABC[0:8]}, va_zero_point);
- const int16x8_t vxb${ABC[0:8]} = vsubl_s8(vb${ABC[0:8]}, vb_zero_point);
-
- int32x4_t vacc${ABC[0:4]} = vmulq_s32(vmovl_s16(vget_low_s16(vxa${ABC[0:8]})), va_multiplier);
- int32x4_t vacc${ABC[4:8]} = vmulq_s32(vmovl_s16(vget_high_s16(vxa${ABC[0:8]})), va_multiplier);
-
- vacc${ABC[0:4]} = vmlaq_s32(vacc${ABC[0:4]}, vmovl_s16(vget_low_s16(vxb${ABC[0:8]})), vb_multiplier);
- vacc${ABC[4:8]} = vmlaq_s32(vacc${ABC[4:8]}, vmovl_s16(vget_high_s16(vxb${ABC[0:8]})), vb_multiplier);
-
- vacc${ABC[0:4]} = vrshlq_s32(vacc${ABC[0:4]}, vright_shift);
- vacc${ABC[4:8]} = vrshlq_s32(vacc${ABC[4:8]}, vright_shift);
-
- const int16x8_t vacc${ABC[0:8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${ABC[0:4]}), vqmovn_s32(vacc${ABC[4:8]})), voutput_zero_point);
-
- ${XINT8X8_T} vout${ABC[0:8]} = ${VQMOVXN_S16}(vacc${ABC[0:8]});
- vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_min));
- vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_max));
-
- $if BATCH_TILE > 8:
- if XNN_LIKELY(n >= (8 * sizeof(${XINT8_T}))) {
- ${VST1_X8}(output, vout${ABC[0:8]}); output += 8;
- n -= 8 * sizeof(${XINT8_T});
- } else {
- if (n & (4 * sizeof(${XINT8_T}))) {
- vst1_lane_u32(__builtin_assume_aligned(output, 1), ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4;
- vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
- }
- if (n & (2 * sizeof(${XINT8_T}))) {
- vst1_lane_u16(__builtin_assume_aligned(output, 1), ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2;
- vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
- }
- if (n & (1 * sizeof(${XINT8_T}))) {
- ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0);
- }
- n = 0;
- }
- $else:
- if (n & (4 * sizeof(${XINT8_T}))) {
- vst1_lane_u32(__builtin_assume_aligned(output, 1), ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4;
- vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
- }
- if (n & (2 * sizeof(${XINT8_T}))) {
- vst1_lane_u16(__builtin_assume_aligned(output, 1), ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2;
- vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
- }
- if (n & (1 * sizeof(${XINT8_T}))) {
- ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0);
- }
- }${" while (n != 0);" if BATCH_TILE > 8 else ""}
- }
-}
diff --git a/src/qs8-vadd/neon.c.in b/src/qs8-vadd/neon.c.in
new file mode 100644
index 0000000..f6814f7
--- /dev/null
+++ b/src/qs8-vadd/neon.c.in
@@ -0,0 +1,237 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert DATATYPE in ["QS8", "QU8"]
+$assert BATCH_TILE % (16 if LD128 else 8) == 0
+$assert BATCH_TILE >= 8
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/vadd.h>
+
+
+$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
+$XINT8X8_T = {"QS8": "int8x8_t", "QU8": "uint8x8_t"}[DATATYPE]
+$XINT8X16_T = {"QS8": "int8x16_t", "QU8": "uint8x16_t"}[DATATYPE]
+$VLD1_X8 = {"QS8": "vld1_s8", "QU8": "vld1_u8"}[DATATYPE]
+$VLD1Q_X8 = {"QS8": "vld1q_s8", "QU8": "vld1q_u8"}[DATATYPE]
+$VLD1_DUP_X8 = {"QS8": "vld1_dup_s8", "QU8": "vld1_dup_u8"}[DATATYPE]
+$VLD1Q_DUP_X8 = {"QS8": "vld1q_dup_s8", "QU8": "vld1q_dup_u8"}[DATATYPE]
+$VST1_LANE_X8 = {"QS8": "vst1_lane_s8", "QU8": "vst1_lane_u8"}[DATATYPE]
+$VST1_X8 = {"QS8": "vst1_s8", "QU8": "vst1_u8"}[DATATYPE]
+$VST1Q_X8 = {"QS8": "vst1q_s8", "QU8": "vst1q_u8"}[DATATYPE]
+$VMIN_X8 = {"QS8": "vmin_s8", "QU8": "vmin_u8"}[DATATYPE]
+$VMAX_X8 = {"QS8": "vmax_s8", "QU8": "vmax_u8"}[DATATYPE]
+$VMINQ_X8 = {"QS8": "vminq_s8", "QU8": "vminq_u8"}[DATATYPE]
+$VMAXQ_X8 = {"QS8": "vmaxq_s8", "QU8": "vmaxq_u8"}[DATATYPE]
+$VQMOVXN_S16 = {"QS8": "vqmovn_s16", "QU8": "vqmovun_s16"}[DATATYPE]
+$VEXT_X8 = {"QS8": "vext_s8", "QU8": "vext_u8"}[DATATYPE]
+$VGET_LOW_X8 = {"QS8": "vget_low_s8", "QU8": "vget_low_u8"}[DATATYPE]
+$VCOMBINE_X8 = {"QS8": "vcombine_s8", "QU8": "vcombine_u8"}[DATATYPE]
+$VREINTERPRET_U32_X8 = {"QS8": "vreinterpret_u32_s8", "QU8": "vreinterpret_u32_u8"}[DATATYPE]
+$VREINTERPRET_U16_X8 = {"QS8": "vreinterpret_u16_s8", "QU8": "vreinterpret_u16_u8"}[DATATYPE]
+void xnn_${DATATYPE.lower()}_vadd_minmax_ukernel__neon_${"ld128" if LD128 else "ld64"}_x${BATCH_TILE}(
+ size_t n,
+ const ${XINT8_T}* input_a,
+ const ${XINT8_T}* input_b,
+ ${XINT8_T}* output,
+ const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ $if LD128:
+ #if XNN_ARCH_ARM64
+      const ${XINT8X16_T} va_zero_point = ${VLD1Q_DUP_X8}(&params->neon.a_zero_point);
+      const ${XINT8X16_T} vb_zero_point = ${VLD1Q_DUP_X8}(&params->neon.b_zero_point);
+    #else
+      const ${XINT8X8_T} va_zero_point = ${VLD1_DUP_X8}(&params->neon.a_zero_point);
+      const ${XINT8X8_T} vb_zero_point = ${VLD1_DUP_X8}(&params->neon.b_zero_point);
+    #endif
+  $else:
+    const ${XINT8X8_T} va_zero_point = ${VLD1_DUP_X8}(&params->neon.a_zero_point);
+    const ${XINT8X8_T} vb_zero_point = ${VLD1_DUP_X8}(&params->neon.b_zero_point);
+  const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier);
+  const int32x4_t vb_multiplier = vld1q_dup_s32(&params->neon.b_multiplier);
+  const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+  $if BATCH_TILE >= 16:
+    const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(&params->neon.output_min);
+    const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(&params->neon.output_max);
+  $else:
+    const ${XINT8X8_T} voutput_min = ${VLD1_DUP_X8}(&params->neon.output_min);
+    const ${XINT8X8_T} voutput_max = ${VLD1_DUP_X8}(&params->neon.output_max);
+
+ for (; n >= ${BATCH_TILE} * sizeof(${XINT8_T}); n -= ${BATCH_TILE} * sizeof(${XINT8_T})) {
+ $if LD128:
+ $for N in range(0, BATCH_TILE, 16):
+ const ${XINT8X16_T} va${ABC[N:N+16]} = ${VLD1Q_X8}(input_a); input_a += 16;
+ const ${XINT8X16_T} vb${ABC[N:N+16]} = ${VLD1Q_X8}(input_b); input_b += 16;
+
+ #if XNN_ARCH_ARM64
+ $for N in range(0, BATCH_TILE, 16):
+ $if DATATYPE == "QU8":
+ const int16x8_t vxa${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va${ABC[N:N+16]}), vget_low_u8(va_zero_point)));
+ const int16x8_t vxa${ABC[N+8:N+16]} = vreinterpretq_s16_u16(vsubl_high_u8(va${ABC[N:N+16]}, va_zero_point));
+ const int16x8_t vxb${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(vb${ABC[N:N+16]}), vget_low_u8(vb_zero_point)));
+ const int16x8_t vxb${ABC[N+8:N+16]} = vreinterpretq_s16_u16(vsubl_high_u8(vb${ABC[N:N+16]}, vb_zero_point));
+ $else:
+ const int16x8_t vxa${ABC[N:N+8]} = vsubl_s8(vget_low_s8(va${ABC[N:N+16]}), vget_low_s8(va_zero_point));
+ const int16x8_t vxa${ABC[N+8:N+16]} = vsubl_high_s8(va${ABC[N:N+16]}, va_zero_point);
+ const int16x8_t vxb${ABC[N:N+8]} = vsubl_s8(vget_low_s8(vb${ABC[N:N+16]}), vget_low_s8(vb_zero_point));
+ const int16x8_t vxb${ABC[N+8:N+16]} = vsubl_high_s8(vb${ABC[N:N+16]}, vb_zero_point);
+ #else // !XNN_ARCH_ARM64
+ $for N in range(0, BATCH_TILE, 16):
+ $if DATATYPE == "QU8":
+ const int16x8_t vxa${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va${ABC[N:N+16]}), va_zero_point));
+ const int16x8_t vxa${ABC[N+8:N+16]} = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va${ABC[N:N+16]}), va_zero_point));
+ const int16x8_t vxb${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(vb${ABC[N:N+16]}), vb_zero_point));
+ const int16x8_t vxb${ABC[N+8:N+16]} = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(vb${ABC[N:N+16]}), vb_zero_point));
+ $else:
+ const int16x8_t vxa${ABC[N:N+8]} = vsubl_s8(vget_low_s8(va${ABC[N:N+16]}), va_zero_point);
+ const int16x8_t vxa${ABC[N+8:N+16]} = vsubl_s8(vget_high_s8(va${ABC[N:N+16]}), va_zero_point);
+ const int16x8_t vxb${ABC[N:N+8]} = vsubl_s8(vget_low_s8(vb${ABC[N:N+16]}), vb_zero_point);
+ const int16x8_t vxb${ABC[N+8:N+16]} = vsubl_s8(vget_high_s8(vb${ABC[N:N+16]}), vb_zero_point);
+ #endif // XNN_ARCH_ARM64
+ $else:
+ $for N in range(0, BATCH_TILE, 8):
+ const ${XINT8X8_T} va${ABC[N:N+8]} = ${VLD1_X8}(input_a); input_a += 8;
+ const ${XINT8X8_T} vb${ABC[N:N+8]} = ${VLD1_X8}(input_b); input_b += 8;
+
+ $for N in range(0, BATCH_TILE, 8):
+ $if DATATYPE == "QU8":
+ const int16x8_t vxa${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[N:N+8]}, va_zero_point));
+ const int16x8_t vxb${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[N:N+8]}, vb_zero_point));
+ $else:
+ const int16x8_t vxa${ABC[N:N+8]} = vsubl_s8(va${ABC[N:N+8]}, va_zero_point);
+ const int16x8_t vxb${ABC[N:N+8]} = vsubl_s8(vb${ABC[N:N+8]}, vb_zero_point);
+
+ $for N in range(0, BATCH_TILE, 8):
+ int32x4_t vacc${ABC[N:N+4]} = vmulq_s32(vmovl_s16(vget_low_s16(vxa${ABC[N:N+8]})), va_multiplier);
+ int32x4_t vacc${ABC[N+4:N+8]} = vmulq_s32(vmovl_s16(vget_high_s16(vxa${ABC[N:N+8]})), va_multiplier);
+
+ $for N in range(0, BATCH_TILE, 8):
+ vacc${ABC[N:N+4]} = vmlaq_s32(vacc${ABC[N:N+4]}, vmovl_s16(vget_low_s16(vxb${ABC[N:N+8]})), vb_multiplier);
+ vacc${ABC[N+4:N+8]} = vmlaq_s32(vacc${ABC[N+4:N+8]}, vmovl_s16(vget_high_s16(vxb${ABC[N:N+8]})), vb_multiplier);
+
+ $for N in range(0, BATCH_TILE, 4):
+ vacc${ABC[N:N+4]} = vrshlq_s32(vacc${ABC[N:N+4]}, vright_shift);
+
+ $for N in range(0, BATCH_TILE, 8):
+ const int16x8_t vacc${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${ABC[N:N+4]}), vqmovn_s32(vacc${ABC[N+4:N+8]})), voutput_zero_point);
+
+ $for N in range(0, BATCH_TILE, 16):
+ $if N + 8 < BATCH_TILE:
+ ${XINT8X16_T} vout${ABC[N:N+16]} = ${VCOMBINE_X8}(${VQMOVXN_S16}(vacc${ABC[N:N+8]}), ${VQMOVXN_S16}(vacc${ABC[N+8:N+16]}));
+ $else:
+ ${XINT8X8_T} vout${ABC[N:N+8]} = ${VQMOVXN_S16}(vacc${ABC[N:N+8]});
+
+ $for N in range(0, BATCH_TILE, 16):
+ $if N + 8 < BATCH_TILE:
+ vout${ABC[N:N+16]} = ${VMAXQ_X8}(vout${ABC[N:N+16]}, voutput_min);
+ $elif BATCH_TILE >= 16:
+ vout${ABC[N:N+8]} = ${VMAX_X8}(vout${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_min));
+ $else:
+ vout${ABC[N:N+8]} = ${VMAX_X8}(vout${ABC[N:N+8]}, voutput_min);
+
+ $for N in range(0, BATCH_TILE, 16):
+ $if N + 8 < BATCH_TILE:
+ vout${ABC[N:N+16]} = ${VMINQ_X8}(vout${ABC[N:N+16]}, voutput_max);
+ $elif BATCH_TILE >= 16:
+ vout${ABC[N:N+8]} = ${VMIN_X8}(vout${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_max));
+ $else:
+ vout${ABC[N:N+8]} = ${VMIN_X8}(vout${ABC[N:N+8]}, voutput_max);
+
+ $for N in range(0, BATCH_TILE, 16):
+ $if N + 8 < BATCH_TILE:
+ ${VST1Q_X8}(output, vout${ABC[N:N+16]}); output += 16;
+ $else:
+ ${VST1_X8}(output, vout${ABC[N:N+8]}); output += 8;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ ${"do " if BATCH_TILE > 8 else ""}{
+ $if BATCH_TILE > 8:
+ const ${XINT8X8_T} va${ABC[0:8]} = ${VLD1_X8}(input_a); input_a += 8;
+ const ${XINT8X8_T} vb${ABC[0:8]} = ${VLD1_X8}(input_b); input_b += 8;
+ $else:
+ const ${XINT8X8_T} va${ABC[0:8]} = ${VLD1_X8}(input_a);
+ const ${XINT8X8_T} vb${ABC[0:8]} = ${VLD1_X8}(input_b);
+
+ $if LD128:
+ $if DATATYPE == "QU8":
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[0:8]}, vget_low_u8(va_zero_point)));
+ const int16x8_t vxb${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[0:8]}, vget_low_u8(vb_zero_point)));
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[0:8]}, va_zero_point));
+ const int16x8_t vxb${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[0:8]}, vb_zero_point));
+ #endif // XNN_ARCH_ARM64
+ $else:
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa${ABC[0:8]} = vsubl_s8(va${ABC[0:8]}, vget_low_s8(va_zero_point));
+ const int16x8_t vxb${ABC[0:8]} = vsubl_s8(vb${ABC[0:8]}, vget_low_s8(vb_zero_point));
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa${ABC[0:8]} = vsubl_s8(va${ABC[0:8]}, va_zero_point);
+ const int16x8_t vxb${ABC[0:8]} = vsubl_s8(vb${ABC[0:8]}, vb_zero_point);
+ #endif // XNN_ARCH_ARM64
+ $else:
+ $if DATATYPE == "QU8":
+ const int16x8_t vxa${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[0:8]}, va_zero_point));
+ const int16x8_t vxb${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(vb${ABC[0:8]}, vb_zero_point));
+ $else:
+ const int16x8_t vxa${ABC[0:8]} = vsubl_s8(va${ABC[0:8]}, va_zero_point);
+ const int16x8_t vxb${ABC[0:8]} = vsubl_s8(vb${ABC[0:8]}, vb_zero_point);
+
+ int32x4_t vacc${ABC[0:4]} = vmulq_s32(vmovl_s16(vget_low_s16(vxa${ABC[0:8]})), va_multiplier);
+ int32x4_t vacc${ABC[4:8]} = vmulq_s32(vmovl_s16(vget_high_s16(vxa${ABC[0:8]})), va_multiplier);
+
+ vacc${ABC[0:4]} = vmlaq_s32(vacc${ABC[0:4]}, vmovl_s16(vget_low_s16(vxb${ABC[0:8]})), vb_multiplier);
+ vacc${ABC[4:8]} = vmlaq_s32(vacc${ABC[4:8]}, vmovl_s16(vget_high_s16(vxb${ABC[0:8]})), vb_multiplier);
+
+ vacc${ABC[0:4]} = vrshlq_s32(vacc${ABC[0:4]}, vright_shift);
+ vacc${ABC[4:8]} = vrshlq_s32(vacc${ABC[4:8]}, vright_shift);
+
+ const int16x8_t vacc${ABC[0:8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${ABC[0:4]}), vqmovn_s32(vacc${ABC[4:8]})), voutput_zero_point);
+
+ ${XINT8X8_T} vout${ABC[0:8]} = ${VQMOVXN_S16}(vacc${ABC[0:8]});
+ $if BATCH_TILE >= 16:
+ vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_min));
+ vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_max));
+ $else:
+ vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, voutput_min);
+ vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, voutput_max);
+
+ $if BATCH_TILE > 8:
+ if XNN_LIKELY(n >= (8 * sizeof(${XINT8_T}))) {
+ ${VST1_X8}(output, vout${ABC[0:8]}); output += 8;
+ n -= 8 * sizeof(${XINT8_T});
+ } else {
+ if (n & (4 * sizeof(${XINT8_T}))) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4;
+ vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
+ }
+ if (n & (2 * sizeof(${XINT8_T}))) {
+ vst1_lane_u16(__builtin_assume_aligned(output, 1), ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2;
+ vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
+ }
+ if (n & (1 * sizeof(${XINT8_T}))) {
+ ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0);
+ }
+ n = 0;
+ }
+ $else:
+ if (n & (4 * sizeof(${XINT8_T}))) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), ${VREINTERPRET_U32_X8}(vout${ABC[0:8]}), 0); output += 4;
+ vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 4);
+ }
+ if (n & (2 * sizeof(${XINT8_T}))) {
+ vst1_lane_u16(__builtin_assume_aligned(output, 1), ${VREINTERPRET_U16_X8}(vout${ABC[0:8]}), 0); output += 2;
+ vout${ABC[0:8]} = ${VEXT_X8}(vout${ABC[0:8]}, vout${ABC[0:8]}, 2);
+ }
+ if (n & (1 * sizeof(${XINT8_T}))) {
+ ${VST1_LANE_X8}(output, vout${ABC[0:8]}, 0);
+ }
+ }${" while (n != 0);" if BATCH_TILE > 8 else ""}
+ }
+}
diff --git a/src/qs8-vaddc/gen/minmax-neon-ld128-x16.c b/src/qs8-vaddc/gen/minmax-neon-ld128-x16.c
new file mode 100644
index 0000000..519f079
--- /dev/null
+++ b/src/qs8-vaddc/gen/minmax-neon-ld128-x16.c
@@ -0,0 +1,112 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vaddc/neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16(
+ size_t n,
+ const int8_t* input_a,
+ const int8_t* input_b,
+ int8_t* output,
+ const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ #if XNN_ARCH_ARM64
+    const int8x16_t va_zero_point = vld1q_dup_s8(&params->neon.a_zero_point);
+  #else
+    const int8x8_t va_zero_point = vld1_dup_s8(&params->neon.a_zero_point);
+  #endif
+  const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier);
+  const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+  const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+  const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+ const int32_t vxb = (int32_t) *input_b - (int32_t) params->neon.b_zero_point;
+ const int32_t vb = params->neon.b_multiplier;
+ const int32x4_t vbias = vdupq_n_s32(vxb * vb);
+
+ for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
+ const int8x16_t va0123456789ABCDEF = vld1q_s8(input_a); input_a += 16;
+
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(vget_low_s8(va0123456789ABCDEF), vget_low_s8(va_zero_point));
+ const int16x8_t vxa89ABCDEF = vsubl_high_s8(va0123456789ABCDEF, va_zero_point);
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(vget_low_s8(va0123456789ABCDEF), va_zero_point);
+ const int16x8_t vxa89ABCDEF = vsubl_s8(vget_high_s8(va0123456789ABCDEF), va_zero_point);
+ #endif // XNN_ARCH_ARM64
+
+ int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc89AB = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier);
+ int32x4_t vaccCDEF = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier);
+
+ vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+ vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+ vacc89AB = vrshlq_s32(vacc89AB, vright_shift);
+ vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift);
+
+ const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+ const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point);
+
+ int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
+
+ vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min);
+
+ vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max);
+
+ vst1q_s8(output, vout0123456789ABCDEF); output += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ do {
+ const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8;
+
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(va01234567, vget_low_s8(va_zero_point));
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point);
+ #endif
+
+ int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+
+ vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+ vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+
+ const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+
+ int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
+ vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
+ vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
+
+ if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
+ vst1_s8(output, vout01234567); output += 8;
+ n -= 8 * sizeof(int8_t);
+ } else {
+ if (n & (4 * sizeof(int8_t))) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_s8(vout01234567), 0); output += 4;
+ vout01234567 = vext_s8(vout01234567, vout01234567, 4);
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ vst1_lane_u16(__builtin_assume_aligned(output, 1), vreinterpret_u16_s8(vout01234567), 0); output += 2;
+ vout01234567 = vext_s8(vout01234567, vout01234567, 2);
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ vst1_lane_s8(output, vout01234567, 0);
+ }
+ n = 0;
+ }
+ } while (n != 0);
+ }
+}
diff --git a/src/qs8-vaddc/gen/minmax-neon-ld128-x32.c b/src/qs8-vaddc/gen/minmax-neon-ld128-x32.c
new file mode 100644
index 0000000..e8dbdd6
--- /dev/null
+++ b/src/qs8-vaddc/gen/minmax-neon-ld128-x32.c
@@ -0,0 +1,131 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vaddc/neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32(
+ size_t n,
+ const int8_t* input_a,
+ const int8_t* input_b,
+ int8_t* output,
+ const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ #if XNN_ARCH_ARM64
+    const int8x16_t va_zero_point = vld1q_dup_s8(&params->neon.a_zero_point);
+  #else
+    const int8x8_t va_zero_point = vld1_dup_s8(&params->neon.a_zero_point);
+  #endif
+  const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier);
+  const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+  const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+  const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+ const int32_t vxb = (int32_t) *input_b - (int32_t) params->neon.b_zero_point;
+ const int32_t vb = params->neon.b_multiplier;
+ const int32x4_t vbias = vdupq_n_s32(vxb * vb);
+
+ for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
+ const int8x16_t va0123456789ABCDEF = vld1q_s8(input_a); input_a += 16;
+ const int8x16_t vaGHIJKLMNOPQRSTUV = vld1q_s8(input_a); input_a += 16;
+
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(vget_low_s8(va0123456789ABCDEF), vget_low_s8(va_zero_point));
+ const int16x8_t vxa89ABCDEF = vsubl_high_s8(va0123456789ABCDEF, va_zero_point);
+ const int16x8_t vxaGHIJKLMN = vsubl_s8(vget_low_s8(vaGHIJKLMNOPQRSTUV), vget_low_s8(va_zero_point));
+ const int16x8_t vxaOPQRSTUV = vsubl_high_s8(vaGHIJKLMNOPQRSTUV, va_zero_point);
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(vget_low_s8(va0123456789ABCDEF), va_zero_point);
+ const int16x8_t vxa89ABCDEF = vsubl_s8(vget_high_s8(va0123456789ABCDEF), va_zero_point);
+ const int16x8_t vxaGHIJKLMN = vsubl_s8(vget_low_s8(vaGHIJKLMNOPQRSTUV), va_zero_point);
+ const int16x8_t vxaOPQRSTUV = vsubl_s8(vget_high_s8(vaGHIJKLMNOPQRSTUV), va_zero_point);
+ #endif // XNN_ARCH_ARM64
+
+ int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc89AB = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier);
+ int32x4_t vaccCDEF = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier);
+ int32x4_t vaccGHIJ = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxaGHIJKLMN)), va_multiplier);
+ int32x4_t vaccKLMN = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxaGHIJKLMN)), va_multiplier);
+ int32x4_t vaccOPQR = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxaOPQRSTUV)), va_multiplier);
+ int32x4_t vaccSTUV = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxaOPQRSTUV)), va_multiplier);
+
+ vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+ vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+ vacc89AB = vrshlq_s32(vacc89AB, vright_shift);
+ vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift);
+ vaccGHIJ = vrshlq_s32(vaccGHIJ, vright_shift);
+ vaccKLMN = vrshlq_s32(vaccKLMN, vright_shift);
+ vaccOPQR = vrshlq_s32(vaccOPQR, vright_shift);
+ vaccSTUV = vrshlq_s32(vaccSTUV, vright_shift);
+
+ const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+ const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point);
+ const int16x8_t vaccGHIJKLMN = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccGHIJ), vqmovn_s32(vaccKLMN)), voutput_zero_point);
+ const int16x8_t vaccOPQRSTUV = vqaddq_s16(vcombine_s16(vqmovn_s32(vaccOPQR), vqmovn_s32(vaccSTUV)), voutput_zero_point);
+
+ int8x16_t vout0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc01234567), vqmovn_s16(vacc89ABCDEF));
+ int8x16_t voutGHIJKLMNOPQRSTUV = vcombine_s8(vqmovn_s16(vaccGHIJKLMN), vqmovn_s16(vaccOPQRSTUV));
+
+ vout0123456789ABCDEF = vmaxq_s8(vout0123456789ABCDEF, voutput_min);
+ voutGHIJKLMNOPQRSTUV = vmaxq_s8(voutGHIJKLMNOPQRSTUV, voutput_min);
+
+ vout0123456789ABCDEF = vminq_s8(vout0123456789ABCDEF, voutput_max);
+ voutGHIJKLMNOPQRSTUV = vminq_s8(voutGHIJKLMNOPQRSTUV, voutput_max);
+
+ vst1q_s8(output, vout0123456789ABCDEF); output += 16;
+ vst1q_s8(output, voutGHIJKLMNOPQRSTUV); output += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ do {
+ const int8x8_t va01234567 = vld1_s8(input_a); input_a += 8;
+
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(va01234567, vget_low_s8(va_zero_point));
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vsubl_s8(va01234567, va_zero_point);
+ #endif
+
+ int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+
+ vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+ vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+
+ const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+
+ int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
+ vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
+ vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
+
+ if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
+ vst1_s8(output, vout01234567); output += 8;
+ n -= 8 * sizeof(int8_t);
+ } else {
+ if (n & (4 * sizeof(int8_t))) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_s8(vout01234567), 0); output += 4;
+ vout01234567 = vext_s8(vout01234567, vout01234567, 4);
+ }
+ if (n & (2 * sizeof(int8_t))) {
+ vst1_lane_u16(__builtin_assume_aligned(output, 1), vreinterpret_u16_s8(vout01234567), 0); output += 2;
+ vout01234567 = vext_s8(vout01234567, vout01234567, 2);
+ }
+ if (n & (1 * sizeof(int8_t))) {
+ vst1_lane_s8(output, vout01234567, 0);
+ }
+ n = 0;
+ }
+ } while (n != 0);
+ }
+}
diff --git a/src/qs8-vaddc/gen/minmax-neon-ld64-x16.c b/src/qs8-vaddc/gen/minmax-neon-ld64-x16.c
index 501ddbc..3da548e 100644
--- a/src/qs8-vaddc/gen/minmax-neon-ld64-x16.c
+++ b/src/qs8-vaddc/gen/minmax-neon-ld64-x16.c
@@ -1,5 +1,5 @@
// Auto-generated file. Do not edit!
-// Template: src/qs8-vaddc/neon-ld64.c.in
+// Template: src/qs8-vaddc/neon.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
@@ -13,8 +13,6 @@
#include <xnnpack/vadd.h>
-#include <inttypes.h>
-
void xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x16(
size_t n,
diff --git a/src/qs8-vaddc/gen/minmax-neon-ld64-x24.c b/src/qs8-vaddc/gen/minmax-neon-ld64-x24.c
index 75eb36b..cdd09ff 100644
--- a/src/qs8-vaddc/gen/minmax-neon-ld64-x24.c
+++ b/src/qs8-vaddc/gen/minmax-neon-ld64-x24.c
@@ -1,5 +1,5 @@
// Auto-generated file. Do not edit!
-// Template: src/qs8-vaddc/neon-ld64.c.in
+// Template: src/qs8-vaddc/neon.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
@@ -13,8 +13,6 @@
#include <xnnpack/vadd.h>
-#include <inttypes.h>
-
void xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x24(
size_t n,
diff --git a/src/qs8-vaddc/gen/minmax-neon-ld64-x32.c b/src/qs8-vaddc/gen/minmax-neon-ld64-x32.c
index a28ee68..fb7e05a 100644
--- a/src/qs8-vaddc/gen/minmax-neon-ld64-x32.c
+++ b/src/qs8-vaddc/gen/minmax-neon-ld64-x32.c
@@ -1,5 +1,5 @@
// Auto-generated file. Do not edit!
-// Template: src/qs8-vaddc/neon-ld64.c.in
+// Template: src/qs8-vaddc/neon.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
@@ -13,8 +13,6 @@
#include <xnnpack/vadd.h>
-#include <inttypes.h>
-
void xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32(
size_t n,
diff --git a/src/qs8-vaddc/gen/minmax-neon-ld64-x8.c b/src/qs8-vaddc/gen/minmax-neon-ld64-x8.c
index e8d6da9..487d41e 100644
--- a/src/qs8-vaddc/gen/minmax-neon-ld64-x8.c
+++ b/src/qs8-vaddc/gen/minmax-neon-ld64-x8.c
@@ -1,5 +1,5 @@
// Auto-generated file. Do not edit!
-// Template: src/qs8-vaddc/neon-ld64.c.in
+// Template: src/qs8-vaddc/neon.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
@@ -13,8 +13,6 @@
#include <xnnpack/vadd.h>
-#include <inttypes.h>
-
void xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x8(
size_t n,
@@ -27,8 +25,8 @@
 const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier);
 const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
 const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
- const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
- const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+ const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+ const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
const int32_t vxb = (int32_t) *input_b - (int32_t) params->neon.b_zero_point;
const int32_t vb = params->neon.b_multiplier;
@@ -49,9 +47,9 @@
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
- vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
+ vout01234567 = vmax_s8(vout01234567, voutput_min);
- vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
+ vout01234567 = vmin_s8(vout01234567, voutput_max);
vst1_s8(output, vout01234567); output += 8;
}
@@ -70,8 +68,8 @@
const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
int8x8_t vout01234567 = vqmovn_s16(vacc01234567);
- vout01234567 = vmax_s8(vout01234567, vget_low_s8(voutput_min));
- vout01234567 = vmin_s8(vout01234567, vget_low_s8(voutput_max));
+ vout01234567 = vmax_s8(vout01234567, voutput_min);
+ vout01234567 = vmin_s8(vout01234567, voutput_max);
if (n & (4 * sizeof(int8_t))) {
vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_s8(vout01234567), 0); output += 4;
diff --git a/src/qs8-vaddc/neon-ld64.c.in b/src/qs8-vaddc/neon.c.in
similarity index 61%
rename from src/qs8-vaddc/neon-ld64.c.in
rename to src/qs8-vaddc/neon.c.in
index d691355..2a391cc 100644
--- a/src/qs8-vaddc/neon-ld64.c.in
+++ b/src/qs8-vaddc/neon.c.in
@@ -4,7 +4,7 @@
// LICENSE file in the root directory of this source tree.
$assert DATATYPE in ["QS8", "QU8"]
-$assert BATCH_TILE % 8 == 0
+$assert BATCH_TILE % (16 if LD128 else 8) == 0
$assert BATCH_TILE >= 8
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#include <assert.h>
@@ -13,13 +13,12 @@
#include <xnnpack/vadd.h>
-#include <inttypes.h>
-
$XINT8_T = {"QS8": "int8_t", "QU8": "uint8_t"}[DATATYPE]
$XINT8X8_T = {"QS8": "int8x8_t", "QU8": "uint8x8_t"}[DATATYPE]
$XINT8X16_T = {"QS8": "int8x16_t", "QU8": "uint8x16_t"}[DATATYPE]
$VLD1_X8 = {"QS8": "vld1_s8", "QU8": "vld1_u8"}[DATATYPE]
+$VLD1Q_X8 = {"QS8": "vld1q_s8", "QU8": "vld1q_u8"}[DATATYPE]
$VLD1_DUP_X8 = {"QS8": "vld1_dup_s8", "QU8": "vld1_dup_u8"}[DATATYPE]
$VLD1Q_DUP_X8 = {"QS8": "vld1q_dup_s8", "QU8": "vld1q_dup_u8"}[DATATYPE]
$VST1_LANE_X8 = {"QS8": "vst1_lane_s8", "QU8": "vst1_lane_u8"}[DATATYPE]
@@ -35,33 +34,66 @@
$VCOMBINE_X8 = {"QS8": "vcombine_s8", "QU8": "vcombine_u8"}[DATATYPE]
$VREINTERPRET_U32_X8 = {"QS8": "vreinterpret_u32_s8", "QU8": "vreinterpret_u32_u8"}[DATATYPE]
$VREINTERPRET_U16_X8 = {"QS8": "vreinterpret_u16_s8", "QU8": "vreinterpret_u16_u8"}[DATATYPE]
-void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__neon_ld64_x${BATCH_TILE}(
+void xnn_${DATATYPE.lower()}_vaddc_minmax_ukernel__neon_${"ld128" if LD128 else "ld64"}_x${BATCH_TILE}(
size_t n,
const ${XINT8_T}* input_a,
const ${XINT8_T}* input_b,
${XINT8_T}* output,
const union xnn_${DATATYPE.lower()}_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
{
- const ${XINT8X8_T} va_zero_point = ${VLD1_DUP_X8}(&params->neon.a_zero_point);
+  $if LD128:
+    #if XNN_ARCH_ARM64
+      const ${XINT8X16_T} va_zero_point = ${VLD1Q_DUP_X8}(&params->neon.a_zero_point);
+    #else
+      const ${XINT8X8_T} va_zero_point = ${VLD1_DUP_X8}(&params->neon.a_zero_point);
+    #endif
+  $else:
+    const ${XINT8X8_T} va_zero_point = ${VLD1_DUP_X8}(&params->neon.a_zero_point);
  const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier);
  const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
  const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
- const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(&params->neon.output_min);
- const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(&params->neon.output_max);
+  $if BATCH_TILE >= 16:
+    const ${XINT8X16_T} voutput_min = ${VLD1Q_DUP_X8}(&params->neon.output_min);
+    const ${XINT8X16_T} voutput_max = ${VLD1Q_DUP_X8}(&params->neon.output_max);
+  $else:
+    const ${XINT8X8_T} voutput_min = ${VLD1_DUP_X8}(&params->neon.output_min);
+    const ${XINT8X8_T} voutput_max = ${VLD1_DUP_X8}(&params->neon.output_max);
const int32_t vxb = (int32_t) *input_b - (int32_t) params->neon.b_zero_point;
const int32_t vb = params->neon.b_multiplier;
const int32x4_t vbias = vdupq_n_s32(vxb * vb);
for (; n >= ${BATCH_TILE} * sizeof(${XINT8_T}); n -= ${BATCH_TILE} * sizeof(${XINT8_T})) {
- $for N in range(0, BATCH_TILE, 8):
- const ${XINT8X8_T} va${ABC[N:N+8]} = ${VLD1_X8}(input_a); input_a += 8;
+ $if LD128:
+ $for N in range(0, BATCH_TILE, 16):
+ const ${XINT8X16_T} va${ABC[N:N+16]} = ${VLD1Q_X8}(input_a); input_a += 16;
- $for N in range(0, BATCH_TILE, 8):
- $if DATATYPE == "QU8":
- const int16x8_t vxa${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[N:N+8]}, va_zero_point));
- $else:
- const int16x8_t vxa${ABC[N:N+8]} = vsubl_s8(va${ABC[N:N+8]}, va_zero_point);
+ #if XNN_ARCH_ARM64
+ $for N in range(0, BATCH_TILE, 16):
+ $if DATATYPE == "QU8":
+ const int16x8_t vxa${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va${ABC[N:N+16]}), vget_low_u8(va_zero_point)));
+ const int16x8_t vxa${ABC[N+8:N+16]} = vreinterpretq_s16_u16(vsubl_high_u8(va${ABC[N:N+16]}, va_zero_point));
+ $else:
+ const int16x8_t vxa${ABC[N:N+8]} = vsubl_s8(vget_low_s8(va${ABC[N:N+16]}), vget_low_s8(va_zero_point));
+ const int16x8_t vxa${ABC[N+8:N+16]} = vsubl_high_s8(va${ABC[N:N+16]}, va_zero_point);
+ #else // !XNN_ARCH_ARM64
+ $for N in range(0, BATCH_TILE, 16):
+ $if DATATYPE == "QU8":
+ const int16x8_t vxa${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va${ABC[N:N+16]}), va_zero_point));
+ const int16x8_t vxa${ABC[N+8:N+16]} = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va${ABC[N:N+16]}), va_zero_point));
+ $else:
+ const int16x8_t vxa${ABC[N:N+8]} = vsubl_s8(vget_low_s8(va${ABC[N:N+16]}), va_zero_point);
+ const int16x8_t vxa${ABC[N+8:N+16]} = vsubl_s8(vget_high_s8(va${ABC[N:N+16]}), va_zero_point);
+ #endif // XNN_ARCH_ARM64
+ $else:
+ $for N in range(0, BATCH_TILE, 8):
+ const ${XINT8X8_T} va${ABC[N:N+8]} = ${VLD1_X8}(input_a); input_a += 8;
+
+ $for N in range(0, BATCH_TILE, 8):
+ $if DATATYPE == "QU8":
+ const int16x8_t vxa${ABC[N:N+8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[N:N+8]}, va_zero_point));
+ $else:
+ const int16x8_t vxa${ABC[N:N+8]} = vsubl_s8(va${ABC[N:N+8]}, va_zero_point);
$for N in range(0, BATCH_TILE, 8):
int32x4_t vacc${ABC[N:N+4]} = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa${ABC[N:N+8]})), va_multiplier);
@@ -82,14 +114,18 @@
$for N in range(0, BATCH_TILE, 16):
$if N + 8 < BATCH_TILE:
vout${ABC[N:N+16]} = ${VMAXQ_X8}(vout${ABC[N:N+16]}, voutput_min);
- $else:
+ $elif BATCH_TILE >= 16:
vout${ABC[N:N+8]} = ${VMAX_X8}(vout${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_min));
+ $else:
+ vout${ABC[N:N+8]} = ${VMAX_X8}(vout${ABC[N:N+8]}, voutput_min);
$for N in range(0, BATCH_TILE, 16):
$if N + 8 < BATCH_TILE:
vout${ABC[N:N+16]} = ${VMINQ_X8}(vout${ABC[N:N+16]}, voutput_max);
- $else:
+ $elif BATCH_TILE >= 16:
vout${ABC[N:N+8]} = ${VMIN_X8}(vout${ABC[N:N+8]}, ${VGET_LOW_X8}(voutput_max));
+ $else:
+ vout${ABC[N:N+8]} = ${VMIN_X8}(vout${ABC[N:N+8]}, voutput_max);
$for N in range(0, BATCH_TILE, 16):
$if N + 8 < BATCH_TILE:
@@ -104,10 +140,24 @@
$else:
const ${XINT8X8_T} va${ABC[0:8]} = ${VLD1_X8}(input_a);
- $if DATATYPE == "QU8":
- const int16x8_t vxa${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[0:8]}, va_zero_point));
+ $if LD128:
+ $if DATATYPE == "QU8":
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[0:8]}, vget_low_u8(va_zero_point)));
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[0:8]}, va_zero_point));
+ #endif
+ $else:
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa${ABC[0:8]} = vsubl_s8(va${ABC[0:8]}, vget_low_s8(va_zero_point));
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa${ABC[0:8]} = vsubl_s8(va${ABC[0:8]}, va_zero_point);
+ #endif
$else:
- const int16x8_t vxa${ABC[0:8]} = vsubl_s8(va${ABC[0:8]}, va_zero_point);
+ $if DATATYPE == "QU8":
+ const int16x8_t vxa${ABC[0:8]} = vreinterpretq_s16_u16(vsubl_u8(va${ABC[0:8]}, va_zero_point));
+ $else:
+ const int16x8_t vxa${ABC[0:8]} = vsubl_s8(va${ABC[0:8]}, va_zero_point);
int32x4_t vacc${ABC[0:4]} = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa${ABC[0:8]})), va_multiplier);
int32x4_t vacc${ABC[4:8]} = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa${ABC[0:8]})), va_multiplier);
@@ -118,8 +168,12 @@
const int16x8_t vacc${ABC[0:8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${ABC[0:4]}), vqmovn_s32(vacc${ABC[4:8]})), voutput_zero_point);
${XINT8X8_T} vout${ABC[0:8]} = ${VQMOVXN_S16}(vacc${ABC[0:8]});
- vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_min));
- vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_max));
+ $if BATCH_TILE >= 16:
+ vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_min));
+ vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, ${VGET_LOW_X8}(voutput_max));
+ $else:
+ vout${ABC[0:8]} = ${VMAX_X8}(vout${ABC[0:8]}, voutput_min);
+ vout${ABC[0:8]} = ${VMIN_X8}(vout${ABC[0:8]}, voutput_max);
$if BATCH_TILE > 8:
if XNN_LIKELY(n >= (8 * sizeof(${XINT8_T}))) {
diff --git a/src/qu8-vadd/gen/minmax-neon-ld128-x16.c b/src/qu8-vadd/gen/minmax-neon-ld128-x16.c
new file mode 100644
index 0000000..6049320
--- /dev/null
+++ b/src/qu8-vadd/gen/minmax-neon-ld128-x16.c
@@ -0,0 +1,127 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vadd/neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qu8_vadd_minmax_ukernel__neon_ld128_x16(
+ size_t n,
+ const uint8_t* input_a,
+ const uint8_t* input_b,
+ uint8_t* output,
+ const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ #if XNN_ARCH_ARM64
+ const uint8x16_t va_zero_point = vld1q_dup_u8(&params->neon.a_zero_point);
+ const uint8x16_t vb_zero_point = vld1q_dup_u8(&params->neon.b_zero_point);
+ #else
+ const uint8x8_t va_zero_point = vld1_dup_u8(&params->neon.a_zero_point);
+ const uint8x8_t vb_zero_point = vld1_dup_u8(&params->neon.b_zero_point);
+ #endif
+ const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier);
+ const int32x4_t vb_multiplier = vld1q_dup_s32(&params->neon.b_multiplier);
+ const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->neon.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->neon.output_max);
+
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ const uint8x16_t va0123456789ABCDEF = vld1q_u8(input_a); input_a += 16;
+ const uint8x16_t vb0123456789ABCDEF = vld1q_u8(input_b); input_b += 16;
+
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va0123456789ABCDEF), vget_low_u8(va_zero_point)));
+ const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_high_u8(va0123456789ABCDEF, va_zero_point));
+ const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(vb0123456789ABCDEF), vget_low_u8(vb_zero_point)));
+ const int16x8_t vxb89ABCDEF = vreinterpretq_s16_u16(vsubl_high_u8(vb0123456789ABCDEF, vb_zero_point));
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va0123456789ABCDEF), va_zero_point));
+ const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va0123456789ABCDEF), va_zero_point));
+ const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(vb0123456789ABCDEF), vb_zero_point));
+ const int16x8_t vxb89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(vb0123456789ABCDEF), vb_zero_point));
+ #endif // XNN_ARCH_ARM64
+
+ int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc89AB = vmulq_s32(vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier);
+ int32x4_t vaccCDEF = vmulq_s32(vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier);
+
+ vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier);
+ vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier);
+ vacc89AB = vmlaq_s32(vacc89AB, vmovl_s16(vget_low_s16(vxb89ABCDEF)), vb_multiplier);
+ vaccCDEF = vmlaq_s32(vaccCDEF, vmovl_s16(vget_high_s16(vxb89ABCDEF)), vb_multiplier);
+
+ vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+ vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+ vacc89AB = vrshlq_s32(vacc89AB, vright_shift);
+ vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift);
+
+ const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+ const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point);
+
+ uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+
+ vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+
+ vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+
+ vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ do {
+ const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8;
+ const uint8x8_t vb01234567 = vld1_u8(input_b); input_b += 8;
+
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, vget_low_u8(va_zero_point)));
+ const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vget_low_u8(vb_zero_point)));
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point));
+ const int16x8_t vxb01234567 = vreinterpretq_s16_u16(vsubl_u8(vb01234567, vb_zero_point));
+ #endif // XNN_ARCH_ARM64
+
+ int32x4_t vacc0123 = vmulq_s32(vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc4567 = vmulq_s32(vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+
+ vacc0123 = vmlaq_s32(vacc0123, vmovl_s16(vget_low_s16(vxb01234567)), vb_multiplier);
+ vacc4567 = vmlaq_s32(vacc4567, vmovl_s16(vget_high_s16(vxb01234567)), vb_multiplier);
+
+ vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+ vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+
+ const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+ if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
+ vst1_u8(output, vout01234567); output += 8;
+ n -= 8 * sizeof(uint8_t);
+ } else {
+ if (n & (4 * sizeof(uint8_t))) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ vst1_lane_u16(__builtin_assume_aligned(output, 1), vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ vst1_lane_u8(output, vout01234567, 0);
+ }
+ n = 0;
+ }
+ } while (n != 0);
+ }
+}
diff --git a/src/qu8-vadd/gen/minmax-neon-ld64-x16.c b/src/qu8-vadd/gen/minmax-neon-ld64-x16.c
index 90b549e..51bcdd2 100644
--- a/src/qu8-vadd/gen/minmax-neon-ld64-x16.c
+++ b/src/qu8-vadd/gen/minmax-neon-ld64-x16.c
@@ -1,5 +1,5 @@
// Auto-generated file. Do not edit!
-// Template: src/qs8-vadd/neon-ld64.c.in
+// Template: src/qs8-vadd/neon.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
diff --git a/src/qu8-vadd/gen/minmax-neon-ld64-x8.c b/src/qu8-vadd/gen/minmax-neon-ld64-x8.c
index 9e49735..6c4ccbe 100644
--- a/src/qu8-vadd/gen/minmax-neon-ld64-x8.c
+++ b/src/qu8-vadd/gen/minmax-neon-ld64-x8.c
@@ -1,5 +1,5 @@
// Auto-generated file. Do not edit!
-// Template: src/qs8-vadd/neon-ld64.c.in
+// Template: src/qs8-vadd/neon.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
@@ -27,8 +27,8 @@
const int32x4_t vb_multiplier = vld1q_dup_s32(&params->neon.b_multiplier);
const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
- const uint8x16_t voutput_min = vld1q_dup_u8(&params->neon.output_min);
- const uint8x16_t voutput_max = vld1q_dup_u8(&params->neon.output_max);
+ const uint8x8_t voutput_min = vld1_dup_u8(&params->neon.output_min);
+ const uint8x8_t voutput_max = vld1_dup_u8(&params->neon.output_max);
for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8;
@@ -50,9 +50,9 @@
uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
- vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmax_u8(vout01234567, voutput_min);
- vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+ vout01234567 = vmin_u8(vout01234567, voutput_max);
vst1_u8(output, vout01234567); output += 8;
}
@@ -76,8 +76,8 @@
const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
- vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
- vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+ vout01234567 = vmax_u8(vout01234567, voutput_min);
+ vout01234567 = vmin_u8(vout01234567, voutput_max);
if (n & (4 * sizeof(uint8_t))) {
vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_u8(vout01234567), 0); output += 4;
diff --git a/src/qu8-vaddc/gen/minmax-neon-ld128-x16.c b/src/qu8-vaddc/gen/minmax-neon-ld128-x16.c
new file mode 100644
index 0000000..9073a5f
--- /dev/null
+++ b/src/qu8-vaddc/gen/minmax-neon-ld128-x16.c
@@ -0,0 +1,112 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-vaddc/neon.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/vadd.h>
+
+
+void xnn_qu8_vaddc_minmax_ukernel__neon_ld128_x16(
+ size_t n,
+ const uint8_t* input_a,
+ const uint8_t* input_b,
+ uint8_t* output,
+ const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN XNN_DISABLE_MSAN
+{
+ #if XNN_ARCH_ARM64
+ const uint8x16_t va_zero_point = vld1q_dup_u8(&params->neon.a_zero_point);
+ #else
+ const uint8x8_t va_zero_point = vld1_dup_u8(&params->neon.a_zero_point);
+ #endif
+ const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier);
+ const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+ const uint8x16_t voutput_min = vld1q_dup_u8(&params->neon.output_min);
+ const uint8x16_t voutput_max = vld1q_dup_u8(&params->neon.output_max);
+
+ const int32_t vxb = (int32_t) *input_b - (int32_t) params->neon.b_zero_point;
+ const int32_t vb = params->neon.b_multiplier;
+ const int32x4_t vbias = vdupq_n_s32(vxb * vb);
+
+ for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
+ const uint8x16_t va0123456789ABCDEF = vld1q_u8(input_a); input_a += 16;
+
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va0123456789ABCDEF), vget_low_u8(va_zero_point)));
+ const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_high_u8(va0123456789ABCDEF, va_zero_point));
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(va0123456789ABCDEF), va_zero_point));
+ const int16x8_t vxa89ABCDEF = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(va0123456789ABCDEF), va_zero_point));
+ #endif // XNN_ARCH_ARM64
+
+ int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc89AB = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa89ABCDEF)), va_multiplier);
+ int32x4_t vaccCDEF = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa89ABCDEF)), va_multiplier);
+
+ vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+ vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+ vacc89AB = vrshlq_s32(vacc89AB, vright_shift);
+ vaccCDEF = vrshlq_s32(vaccCDEF, vright_shift);
+
+ const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+ const int16x8_t vacc89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc89AB), vqmovn_s32(vaccCDEF)), voutput_zero_point);
+
+ uint8x16_t vout0123456789ABCDEF = vcombine_u8(vqmovun_s16(vacc01234567), vqmovun_s16(vacc89ABCDEF));
+
+ vout0123456789ABCDEF = vmaxq_u8(vout0123456789ABCDEF, voutput_min);
+
+ vout0123456789ABCDEF = vminq_u8(vout0123456789ABCDEF, voutput_max);
+
+ vst1q_u8(output, vout0123456789ABCDEF); output += 16;
+ }
+ if XNN_UNLIKELY(n != 0) {
+ do {
+ const uint8x8_t va01234567 = vld1_u8(input_a); input_a += 8;
+
+ #if XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, vget_low_u8(va_zero_point)));
+ #else // !XNN_ARCH_ARM64
+ const int16x8_t vxa01234567 = vreinterpretq_s16_u16(vsubl_u8(va01234567, va_zero_point));
+ #endif
+
+ int32x4_t vacc0123 = vmlaq_s32(vbias, vmovl_s16(vget_low_s16(vxa01234567)), va_multiplier);
+ int32x4_t vacc4567 = vmlaq_s32(vbias, vmovl_s16(vget_high_s16(vxa01234567)), va_multiplier);
+
+ vacc0123 = vrshlq_s32(vacc0123, vright_shift);
+ vacc4567 = vrshlq_s32(vacc4567, vright_shift);
+
+ const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
+
+ uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
+ vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+
+ if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
+ vst1_u8(output, vout01234567); output += 8;
+ n -= 8 * sizeof(uint8_t);
+ } else {
+ if (n & (4 * sizeof(uint8_t))) {
+ vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_u8(vout01234567), 0); output += 4;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 4);
+ }
+ if (n & (2 * sizeof(uint8_t))) {
+ vst1_lane_u16(__builtin_assume_aligned(output, 1), vreinterpret_u16_u8(vout01234567), 0); output += 2;
+ vout01234567 = vext_u8(vout01234567, vout01234567, 2);
+ }
+ if (n & (1 * sizeof(uint8_t))) {
+ vst1_lane_u8(output, vout01234567, 0);
+ }
+ n = 0;
+ }
+ } while (n != 0);
+ }
+}
diff --git a/src/qu8-vaddc/gen/minmax-neon-ld64-x16.c b/src/qu8-vaddc/gen/minmax-neon-ld64-x16.c
index de82c6b..943ceff 100644
--- a/src/qu8-vaddc/gen/minmax-neon-ld64-x16.c
+++ b/src/qu8-vaddc/gen/minmax-neon-ld64-x16.c
@@ -1,5 +1,5 @@
// Auto-generated file. Do not edit!
-// Template: src/qs8-vaddc/neon-ld64.c.in
+// Template: src/qs8-vaddc/neon.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
@@ -13,8 +13,6 @@
#include <xnnpack/vadd.h>
-#include <inttypes.h>
-
void xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16(
size_t n,
diff --git a/src/qu8-vaddc/gen/minmax-neon-ld64-x8.c b/src/qu8-vaddc/gen/minmax-neon-ld64-x8.c
index 3fcc7b6..b63f5e4 100644
--- a/src/qu8-vaddc/gen/minmax-neon-ld64-x8.c
+++ b/src/qu8-vaddc/gen/minmax-neon-ld64-x8.c
@@ -1,5 +1,5 @@
// Auto-generated file. Do not edit!
-// Template: src/qs8-vaddc/neon-ld64.c.in
+// Template: src/qs8-vaddc/neon.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
@@ -13,8 +13,6 @@
#include <xnnpack/vadd.h>
-#include <inttypes.h>
-
void xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x8(
size_t n,
@@ -27,8 +25,8 @@
const int32x4_t va_multiplier = vld1q_dup_s32(&params->neon.a_multiplier);
const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
- const uint8x16_t voutput_min = vld1q_dup_u8(&params->neon.output_min);
- const uint8x16_t voutput_max = vld1q_dup_u8(&params->neon.output_max);
+ const uint8x8_t voutput_min = vld1_dup_u8(&params->neon.output_min);
+ const uint8x8_t voutput_max = vld1_dup_u8(&params->neon.output_max);
const int32_t vxb = (int32_t) *input_b - (int32_t) params->neon.b_zero_point;
const int32_t vb = params->neon.b_multiplier;
@@ -49,9 +47,9 @@
uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
- vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
+ vout01234567 = vmax_u8(vout01234567, voutput_min);
- vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+ vout01234567 = vmin_u8(vout01234567, voutput_max);
vst1_u8(output, vout01234567); output += 8;
}
@@ -70,8 +68,8 @@
const int16x8_t vacc01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0123), vqmovn_s32(vacc4567)), voutput_zero_point);
uint8x8_t vout01234567 = vqmovun_s16(vacc01234567);
- vout01234567 = vmax_u8(vout01234567, vget_low_u8(voutput_min));
- vout01234567 = vmin_u8(vout01234567, vget_low_u8(voutput_max));
+ vout01234567 = vmax_u8(vout01234567, voutput_min);
+ vout01234567 = vmin_u8(vout01234567, voutput_max);
if (n & (4 * sizeof(uint8_t))) {
vst1_lane_u32(__builtin_assume_aligned(output, 1), vreinterpret_u32_u8(vout01234567), 0); output += 4;
diff --git a/src/xnnpack/vadd.h b/src/xnnpack/vadd.h
index 25e7067..bc00c32 100644
--- a/src/xnnpack/vadd.h
+++ b/src/xnnpack/vadd.h
@@ -30,6 +30,8 @@
DECLARE_QU8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qu8_vadd_minmax_ukernel__neon_ld64_x8)
DECLARE_QU8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16)
+DECLARE_QU8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qu8_vadd_minmax_ukernel__neon_ld128_x16)
+
DECLARE_QU8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x8)
DECLARE_QU8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x16)
@@ -64,6 +66,8 @@
DECLARE_QU8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x8)
DECLARE_QU8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16)
+DECLARE_QU8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qu8_vaddc_minmax_ukernel__neon_ld128_x16)
+
DECLARE_QU8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8)
DECLARE_QU8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16)
@@ -109,6 +113,9 @@
DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__neon_ld64_x24)
DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__neon_ld64_x32)
+DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16)
+DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32)
+
DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8)
DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16)
DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x24)
@@ -161,6 +168,9 @@
DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x24)
DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32)
+DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16)
+DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32)
+
DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8)
DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16)
DECLARE_QS8_VADD_MINMAX_UKERNEL_FUNCTION(xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x24)
diff --git a/test/qs8-vadd-minmax.cc b/test/qs8-vadd-minmax.cc
index df6e955..5dc1d72 100644
--- a/test/qs8-vadd-minmax.cc
+++ b/test/qs8-vadd-minmax.cc
@@ -657,6 +657,326 @@
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, batch_eq_16) {
+ TEST_REQUIRES_ARM_NEON;
+ VAddMicrokernelTester()
+ .batch_size(16)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, batch_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, batch_lt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, batch_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, inplace_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, inplace_b) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, inplace_a_and_b) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, a_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (int32_t a_zero_point = -128; a_zero_point <= 127; a_zero_point += 51) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .a_zero_point(a_zero_point)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, b_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (int32_t b_zero_point = -128; b_zero_point <= 127; b_zero_point += 51) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .b_zero_point(b_zero_point)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, y_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (int32_t y_zero_point = -128; y_zero_point <= 127; y_zero_point += 51) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .y_zero_point(y_zero_point)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, a_scale) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float a_scale = 0.1f; a_scale <= 10.0f; a_scale *= 3.14f) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .a_scale(a_scale)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, b_scale) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float b_scale = 0.1f; b_scale <= 10.0f; b_scale *= 3.14f) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .b_scale(b_scale)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, y_scale) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float y_scale = 0.1f; y_scale <= 10.0f; y_scale *= 3.14f) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .y_scale(y_scale)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X16, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmax(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, batch_eq_32) {
+ TEST_REQUIRES_ARM_NEON;
+ VAddMicrokernelTester()
+ .batch_size(32)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, batch_div_32) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, batch_lt_32) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, batch_gt_32) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, inplace_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, inplace_b) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, inplace_a_and_b) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .inplace_b(true)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, a_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (int32_t a_zero_point = -128; a_zero_point <= 127; a_zero_point += 51) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .a_zero_point(a_zero_point)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, b_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (int32_t b_zero_point = -128; b_zero_point <= 127; b_zero_point += 51) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .b_zero_point(b_zero_point)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, y_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (int32_t y_zero_point = -128; y_zero_point <= 127; y_zero_point += 51) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .y_zero_point(y_zero_point)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, a_scale) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float a_scale = 0.1f; a_scale <= 10.0f; a_scale *= 3.14f) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .a_scale(a_scale)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, b_scale) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float b_scale = 0.1f; b_scale <= 10.0f; b_scale *= 3.14f) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .b_scale(b_scale)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, y_scale) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float y_scale = 0.1f; y_scale <= 10.0f; y_scale *= 3.14f) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .y_scale(y_scale)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADD_MINMAX__NEON_LD128_X32, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmax(128)
+ .Test(xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QS8_VADD_MINMAX__SSE2_MUL16_LD64_X8, batch_eq_8) {
TEST_REQUIRES_X86_SSE2;
diff --git a/test/qs8-vadd-minmax.yaml b/test/qs8-vadd-minmax.yaml
index ba129dd..994a462 100644
--- a/test/qs8-vadd-minmax.yaml
+++ b/test/qs8-vadd-minmax.yaml
@@ -10,6 +10,10 @@
init: xnn_init_qs8_add_minmax_neon_params
- name: xnn_qs8_vadd_minmax_ukernel__neon_ld64_x32
init: xnn_init_qs8_add_minmax_neon_params
+- name: xnn_qs8_vadd_minmax_ukernel__neon_ld128_x16
+ init: xnn_init_qs8_add_minmax_neon_params
+- name: xnn_qs8_vadd_minmax_ukernel__neon_ld128_x32
+ init: xnn_init_qs8_add_minmax_neon_params
- name: xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x8
init: xnn_init_qs8_add_minmax_sse2_params
- name: xnn_qs8_vadd_minmax_ukernel__sse2_mul16_ld64_x16
diff --git a/test/qs8-vaddc-minmax.cc b/test/qs8-vaddc-minmax.cc
index 2d5c6a5..d7fe977 100644
--- a/test/qs8-vaddc-minmax.cc
+++ b/test/qs8-vaddc-minmax.cc
@@ -573,6 +573,284 @@
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X16, batch_eq_16) {
+ TEST_REQUIRES_ARM_NEON;
+ VAddCMicrokernelTester()
+ .batch_size(16)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X16, batch_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X16, batch_lt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X16, batch_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X16, inplace) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X16, a_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (int32_t a_zero_point = -128; a_zero_point <= 127; a_zero_point += 51) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .a_zero_point(a_zero_point)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X16, b_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (int32_t b_zero_point = -128; b_zero_point <= 127; b_zero_point += 51) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .b_zero_point(b_zero_point)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X16, y_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (int32_t y_zero_point = -128; y_zero_point <= 127; y_zero_point += 51) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .y_zero_point(y_zero_point)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X16, a_scale) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float a_scale = 0.1f; a_scale <= 10.0f; a_scale *= 3.14f) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .a_scale(a_scale)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X16, b_scale) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float b_scale = 0.1f; b_scale <= 10.0f; b_scale *= 3.14f) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .b_scale(b_scale)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X16, y_scale) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ for (float y_scale = 0.1f; y_scale <= 10.0f; y_scale *= 3.14f) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .y_scale(y_scale)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X16, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(128)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X16, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .qmax(128)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X32, batch_eq_32) {
+ TEST_REQUIRES_ARM_NEON;
+ VAddCMicrokernelTester()
+ .batch_size(32)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X32, batch_div_32) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 64; batch_size < 320; batch_size += 32) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X32, batch_lt_32) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size < 32; batch_size++) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X32, batch_gt_32) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 33; batch_size < 64; batch_size++) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X32, inplace) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X32, a_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (int32_t a_zero_point = -128; a_zero_point <= 127; a_zero_point += 51) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .a_zero_point(a_zero_point)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X32, b_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (int32_t b_zero_point = -128; b_zero_point <= 127; b_zero_point += 51) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .b_zero_point(b_zero_point)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X32, y_zero_point) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (int32_t y_zero_point = -128; y_zero_point <= 127; y_zero_point += 51) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .y_zero_point(y_zero_point)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X32, a_scale) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float a_scale = 0.1f; a_scale <= 10.0f; a_scale *= 3.14f) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .a_scale(a_scale)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X32, b_scale) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float b_scale = 0.1f; b_scale <= 10.0f; b_scale *= 3.14f) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .b_scale(b_scale)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X32, y_scale) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ for (float y_scale = 0.1f; y_scale <= 10.0f; y_scale *= 3.14f) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .y_scale(y_scale)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X32, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(128)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QS8_VADDC_MINMAX__NEON_LD128_X32, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 160; batch_size += 31) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .qmax(128)
+ .Test(xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32, xnn_init_qs8_add_minmax_neon_params);
+ }
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QS8_VADDC_MINMAX__SSE2_MUL16_LD64_X8, batch_eq_8) {
TEST_REQUIRES_X86_SSE2;
diff --git a/test/qs8-vaddc-minmax.yaml b/test/qs8-vaddc-minmax.yaml
index 4a455bd..d1e57ff 100644
--- a/test/qs8-vaddc-minmax.yaml
+++ b/test/qs8-vaddc-minmax.yaml
@@ -10,6 +10,10 @@
init: xnn_init_qs8_add_minmax_neon_params
- name: xnn_qs8_vaddc_minmax_ukernel__neon_ld64_x32
init: xnn_init_qs8_add_minmax_neon_params
+- name: xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x16
+ init: xnn_init_qs8_add_minmax_neon_params
+- name: xnn_qs8_vaddc_minmax_ukernel__neon_ld128_x32
+ init: xnn_init_qs8_add_minmax_neon_params
- name: xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8
init: xnn_init_qs8_add_minmax_sse2_params
- name: xnn_qs8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16
diff --git a/test/qu8-vadd-minmax.cc b/test/qu8-vadd-minmax.cc
index 9e1a307..eca7f9b 100644
--- a/test/qu8-vadd-minmax.cc
+++ b/test/qu8-vadd-minmax.cc
@@ -193,6 +193,94 @@
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QU8_VADD_MINMAX__NEON_LD128_X16, batch_eq_16) {
+ TEST_REQUIRES_ARM_NEON;
+ VAddMicrokernelTester()
+ .batch_size(16)
+ .Test(xnn_qu8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+
+ TEST(QU8_VADD_MINMAX__NEON_LD128_X16, batch_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QU8_VADD_MINMAX__NEON_LD128_X16, batch_lt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QU8_VADD_MINMAX__NEON_LD128_X16, batch_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QU8_VADD_MINMAX__NEON_LD128_X16, inplace_a) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .Test(xnn_qu8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QU8_VADD_MINMAX__NEON_LD128_X16, inplace_b) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_b(true)
+ .Test(xnn_qu8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QU8_VADD_MINMAX__NEON_LD128_X16, inplace_a_and_b) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace_a(true)
+ .inplace_b(true)
+ .Test(xnn_qu8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QU8_VADD_MINMAX__NEON_LD128_X16, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(128)
+ .Test(xnn_qu8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QU8_VADD_MINMAX__NEON_LD128_X16, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddMicrokernelTester()
+ .batch_size(batch_size)
+ .qmax(128)
+ .Test(xnn_qu8_vadd_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QU8_VADD_MINMAX__SSE2_MUL16_LD64_X8, batch_eq_8) {
TEST_REQUIRES_X86_SSE2;
diff --git a/test/qu8-vadd-minmax.yaml b/test/qu8-vadd-minmax.yaml
index efc9dc2..53c88af 100644
--- a/test/qu8-vadd-minmax.yaml
+++ b/test/qu8-vadd-minmax.yaml
@@ -6,6 +6,8 @@
init: xnn_init_qu8_add_minmax_neon_params
- name: xnn_qu8_vadd_minmax_ukernel__neon_ld64_x16
init: xnn_init_qu8_add_minmax_neon_params
+- name: xnn_qu8_vadd_minmax_ukernel__neon_ld128_x16
+ init: xnn_init_qu8_add_minmax_neon_params
- name: xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x8
init: xnn_init_qu8_add_minmax_sse2_params
- name: xnn_qu8_vadd_minmax_ukernel__sse2_mul16_ld64_x16
diff --git a/test/qu8-vaddc-minmax.cc b/test/qu8-vaddc-minmax.cc
index b6121c4..6f2e8a8 100644
--- a/test/qu8-vaddc-minmax.cc
+++ b/test/qu8-vaddc-minmax.cc
@@ -151,6 +151,73 @@
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QU8_VADDC_MINMAX__NEON_LD128_X16, batch_eq_16) {
+ TEST_REQUIRES_ARM_NEON;
+ VAddCMicrokernelTester()
+ .batch_size(16)
+ .Test(xnn_qu8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+
+ TEST(QU8_VADDC_MINMAX__NEON_LD128_X16, batch_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 32; batch_size < 160; batch_size += 16) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QU8_VADDC_MINMAX__NEON_LD128_X16, batch_lt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size < 16; batch_size++) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QU8_VADDC_MINMAX__NEON_LD128_X16, batch_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 17; batch_size < 32; batch_size++) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .Test(xnn_qu8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QU8_VADDC_MINMAX__NEON_LD128_X16, inplace) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .inplace(true)
+ .Test(xnn_qu8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QU8_VADDC_MINMAX__NEON_LD128_X16, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .qmin(128)
+ .Test(xnn_qu8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+ }
+
+ TEST(QU8_VADDC_MINMAX__NEON_LD128_X16, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t batch_size = 1; batch_size <= 80; batch_size += 15) {
+ VAddCMicrokernelTester()
+ .batch_size(batch_size)
+ .qmax(128)
+ .Test(xnn_qu8_vaddc_minmax_ukernel__neon_ld128_x16, xnn_init_qu8_add_minmax_neon_params);
+ }
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(QU8_VADDC_MINMAX__SSE2_MUL16_LD64_X8, batch_eq_8) {
TEST_REQUIRES_X86_SSE2;
diff --git a/test/qu8-vaddc-minmax.yaml b/test/qu8-vaddc-minmax.yaml
index b006309..7e5f10e 100644
--- a/test/qu8-vaddc-minmax.yaml
+++ b/test/qu8-vaddc-minmax.yaml
@@ -6,6 +6,8 @@
init: xnn_init_qu8_add_minmax_neon_params
- name: xnn_qu8_vaddc_minmax_ukernel__neon_ld64_x16
init: xnn_init_qu8_add_minmax_neon_params
+- name: xnn_qu8_vaddc_minmax_ukernel__neon_ld128_x16
+ init: xnn_init_qu8_add_minmax_neon_params
- name: xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x8
init: xnn_init_qu8_add_minmax_sse2_params
- name: xnn_qu8_vaddc_minmax_ukernel__sse2_mul16_ld64_x16