QS8 C2 Neon igemm
PiperOrigin-RevId: 357621434
diff --git a/BUILD.bazel b/BUILD.bazel
index 901211a..bb818a7 100644
--- a/BUILD.bazel
+++ b/BUILD.bazel
@@ -1754,6 +1754,22 @@
"src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c",
"src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c",
"src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c",
+ "src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c",
+ "src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c",
+ "src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c",
+ "src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c",
+ "src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c",
+ "src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c",
+ "src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c",
+ "src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c",
+ "src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c",
+ "src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c",
+ "src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c",
+ "src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c",
+ "src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c",
+ "src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c",
+ "src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c",
+ "src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c",
"src/qs8-requantization/fp32-neon.c",
"src/qs8-requantization/precise-neon.c",
"src/qs8-requantization/q31-neon.c",
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c809691..05cb6d2 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1003,6 +1003,22 @@
src/qs8-igemm/gen/3x8c8-minmax-neon-mull-padal.c
src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c
src/qs8-igemm/gen/4x8c8-minmax-neon-mull-padal.c
+ src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
+ src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
+ src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
+ src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
+ src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
+ src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
+ src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
+ src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
+ src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
+ src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
+ src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
+ src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
+ src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
+ src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
+ src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
+ src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
src/qs8-requantization/fp32-neon.c
src/qs8-requantization/precise-neon.c
src/qs8-requantization/q31-neon.c
diff --git a/scripts/generate-qs8-igemm.sh b/scripts/generate-qs8-igemm.sh
index 78ef6b1..a2d60d3 100755
--- a/scripts/generate-qs8-igemm.sh
+++ b/scripts/generate-qs8-igemm.sh
@@ -35,6 +35,25 @@
tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=3 -D NR=16 -o src/qs8-igemm/gen/3x16c8-minmax-neon-mull-padal.c
tools/xngen src/qs8-igemm/c8-neon-mull-padal.c.in -D MR=4 -D NR=16 -o src/qs8-igemm/gen/4x16c8-minmax-neon-mull-padal.c
+### C2 micro-kernels
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=0 -D MR=1 -D NR=8 -o src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=0 -D MR=2 -D NR=8 -o src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=0 -D MR=3 -D NR=8 -o src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=0 -D MR=4 -D NR=8 -o src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=0 -D MR=1 -D NR=16 -o src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=0 -D MR=2 -D NR=16 -o src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=0 -D MR=3 -D NR=16 -o src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=0 -D MR=4 -D NR=16 -o src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
+
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=1 -D MR=1 -D NR=8 -o src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=1 -D MR=2 -D NR=8 -o src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=1 -D MR=3 -D NR=8 -o src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=1 -D MR=4 -D NR=8 -o src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=1 -D MR=1 -D NR=16 -o src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=1 -D MR=2 -D NR=16 -o src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=1 -D MR=3 -D NR=16 -o src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
+tools/xngen src/qs8-igemm/c2-neon-mull-padal-dup.c.in -D MLA=1 -D MR=4 -D NR=16 -o src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
+
### C4 micro-kernels
tools/xngen src/qs8-igemm/MRxNRc4-neondot.c.in -D MR=1 -D NR=8 -o src/qs8-igemm/gen/1x8c4-minmax-neondot.c
tools/xngen src/qs8-igemm/MRxNRc4-neondot.c.in -D MR=4 -D NR=8 -o src/qs8-igemm/gen/4x8c4-minmax-neondot.c
diff --git a/src/qs8-igemm/c2-neon-mull-padal-dup.c.in b/src/qs8-igemm/c2-neon-mull-padal-dup.c.in
new file mode 100644
index 0000000..08dc2a0
--- /dev/null
+++ b/src/qs8-igemm/c2-neon-mull-padal-dup.c.in
@@ -0,0 +1,311 @@
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+$assert NR % 8 == 0
+$assert 8 <= NR <= 16
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_${MR}x${NR}c2__neon_${"mlal" if MLA else "mull"}_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{  // QS8 indirect-GEMM tile: accumulates up to ${MR} rows x ${NR} columns in int32, requantizes, clamps and stores int8.
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (${MR} * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  $for M in range(1, MR):
+    int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        c${M} = c${M-1};
+      }
+
+  do {
+    $for N in range(0, NR, 4):
+      int32x4_t vacc0x${ABC[N:N+4]} = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    $for M in range(1, MR):
+      $for N in range(0, NR, 4):
+        int32x4_t vacc${M}x${ABC[N:N+4]} = vacc0x${ABC[N:N+4]};
+
+    size_t p = ks;
+    do {
+      $for M in range(MR):
+        const int8_t* restrict a${M} = a[${M}];
+        if XNN_UNPREDICTABLE(a${M} != zero) {  // a_offset is skipped for rows that point at the `zero` buffer
+          a${M} = (const int8_t*) ((uintptr_t) a${M} + a_offset);
+        }
+      a += ${MR};
+
+      size_t k = kc;
+
+      $if MLA:
+        while (k >= 16 * sizeof(int8_t)) {
+          $for M in range(MR):
+            const int8x8_t va${M}x0 = vld1_s8(a${M}); a${M} += 8;
+            const int8x8_t va${M}x1 = vld1_s8(a${M}); a${M} += 8;
+
+          $for K in range(4):
+            $for N in range(0, NR, 4):
+              const int8x8_t vb${ABC[N:N+4]}c${K}x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          $for K in range(4):
+            $for N in range(0, NR, 4):
+              $for M in range(MR):
+                int16x8_t vprod${M}x${ABC[N:N+4]}c${K} = vmull_s8(vb${ABC[N:N+4]}c${K}x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}x0), ${K})));
+              const int8x8_t vb${ABC[N:N+4]}c${K}x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              $for M in range(MR):
+                vprod${M}x${ABC[N:N+4]}c${K} = vmlal_s8(vprod${M}x${ABC[N:N+4]}c${K}, vb${ABC[N:N+4]}c${K}x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}x1), ${K})));
+              $for M in range(MR):
+                vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c${K});
+
+          k -= 16 * sizeof(int8_t);
+        }
+
+      ${"if" if MLA else "while"} (k >= 8 * sizeof(int8_t)) {
+        $for M in range(MR):
+          const int8x8_t va${M} = vld1_s8(a${M}); a${M} += 8;
+
+        $for K in range(4):
+          $for N in range(0, NR, 4):
+            const int8x8_t vb${ABC[N:N+4]}c${K} = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        $for M in range(MR):
+          $for N in range(0, NR, 4):
+            $for K in range(4):
+              const int16x8_t vprod${M}x${ABC[N:N+4]}c${K} = vmull_s8(vb${ABC[N:N+4]}c${K}, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), ${K})));
+            $for K in range(4):
+              vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c${K});
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {  // 1..7 leftover K bytes; vld1_s8 below still reads a full 8 bytes from each row
+        $for M in range(MR):
+          const int8x8_t va${M} = vld1_s8(a${M}); a${M} = (const int8_t*) ((uintptr_t) a${M} + k);
+
+        $for N in range(0, NR, 4):
+          const int8x8_t vb${ABC[N:N+4]}c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        $for M in range(MR):
+          $for N in range(0, NR, 4):
+            const int16x8_t vprod${M}x${ABC[N:N+4]}c0 = vmull_s8(vb${ABC[N:N+4]}c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 0)));
+            vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          $for N in range(0, NR, 4):
+            const int8x8_t vb${ABC[N:N+4]}c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          $for M in range(MR):
+            $for N in range(0, NR, 4):
+              const int16x8_t vprod${M}x${ABC[N:N+4]}c1 = vmull_s8(vb${ABC[N:N+4]}c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 1)));
+              vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            $for N in range(0, NR, 4):
+              const int8x8_t vb${ABC[N:N+4]}c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            $for M in range(MR):
+              $for N in range(0, NR, 4):
+                const int16x8_t vprod${M}x${ABC[N:N+4]}c2 = vmull_s8(vb${ABC[N:N+4]}c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 2)));
+                vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c2);
+
+            if (k > 6 * sizeof(int8_t)) {
+              $for N in range(0, NR, 4):
+                const int8x8_t vb${ABC[N:N+4]}c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              $for M in range(MR):
+                $for N in range(0, NR, 4):
+                  const int16x8_t vprod${M}x${ABC[N:N+4]}c3 = vmull_s8(vb${ABC[N:N+4]}c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va${M}), 3)));
+                  vacc${M}x${ABC[N:N+4]} = vpadalq_s16(vacc${M}x${ABC[N:N+4]}, vprod${M}x${ABC[N:N+4]}c3);
+            }
+          }
+        }
+      }
+      p -= ${MR} * sizeof(void*);  // ks is measured in bytes of indirection pointers; ${MR} pointers were consumed above
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vqrdmulhq_s32(vacc${M}x${ABC[N:N+4]}, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));  // all-ones where right_shift == 0, so the vsraq-by-31 sign fix-up below adds nothing there
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vsraq_n_s32(vacc${M}x${ABC[N:N+4]}, vbicq_s32(vacc${M}x${ABC[N:N+4]}, vzero_shift_mask), 31);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 4):
+        vacc${M}x${ABC[N:N+4]} = vrshlq_s32(vacc${M}x${ABC[N:N+4]}, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vacc${M}x${ABC[N+4:N+8]}), voutput_zero_point);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vqmovn_high_s16(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vacc${M}x${ABC[N+8:N+16]});
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vqmovn_high_s16(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vacc${M}x${ABC[N:N+8]});
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#else
+    $for M in range(MR):
+      $for N in range(0, NR, 8):
+        const int16x8_t vacc${M}x${ABC[N:N+8]} = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc${M}x${ABC[N:N+4]}), vqmovn_s32(vacc${M}x${ABC[N+4:N+8]})), voutput_zero_point);
+
+    $for M in range(MR):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          int8x16_t vout${M}x${ABC[N:N+16]} = vcombine_s8(vqmovn_s16(vacc${M}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N+8:N+16]}));
+        $elif M % 2 == 1:
+          int8x16_t vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vcombine_s8(vqmovn_s16(vacc${M-1}x${ABC[N:N+8]}), vqmovn_s16(vacc${M}x${ABC[N:N+8]}));
+        $elif M + 1 == MR:
+          int8x8_t vout${M}x${ABC[N:N+8]} = vqmovn_s16(vacc${M}x${ABC[N:N+8]});
+#endif
+    $if NR == 8 and MR == 1:
+      const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+      const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+    $else:
+      const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+      const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    $for M in reversed(range(MR)):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vmaxq_s8(vout${M}x${ABC[N:N+16]}, voutput_min);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vmaxq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_min);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, voutput_min);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmax_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_min));
+
+    $for M in reversed(range(MR)):
+      $for N in range(0, NR, 16):
+        $if N + 8 < NR:
+          vout${M}x${ABC[N:N+16]} = vminq_s8(vout${M}x${ABC[N:N+16]}, voutput_max);
+        $elif M % 2 == 1:
+          vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]} = vminq_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}, voutput_max);
+        $elif M + 1 == MR:
+          $if NR == 8 and MR == 1:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, voutput_max);
+          $else:
+            vout${M}x${ABC[N:N+8]} = vmin_s8(vout${M}x${ABC[N:N+8]}, vget_low_s8(voutput_max));
+
+    if (nc >= ${NR}) {
+      $for M in reversed(range(MR)):
+        $for N in range(0, NR, 16):
+          $if N + 8 < NR:
+            vst1q_s8(c${M} + ${N}, vout${M}x${ABC[N:N+16]});
+          $elif M % 2 == 1:
+            vst1_s8(c${M} + ${N}, vget_high_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+            vst1_s8(c${M-1} + ${N}, vget_low_s8(vout${M-1}x${ABC[N:N+8]}_${M}x${ABC[N:N+8]}));
+          $elif M + 1 == MR:
+            vst1_s8(c${M} + ${N}, vout${M}x${ABC[N:N+8]});
+
+      $for M in reversed(range(MR)):
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);  // rewind: the p-loop above advanced `a` by ks bytes in total
+
+      nc -= ${NR};
+    } else {
+      $if NR == 16:
+        $for M in range(MR):
+          $if M % 2 == 1:
+            int8x16_t vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_low_s8(vout${M-1}x0123456789ABCDEF), vget_low_s8(vout${M}x0123456789ABCDEF));
+          $elif M + 1 == MR:
+            int8x8_t vout${M}x01234567 = vget_low_s8(vout${M}x0123456789ABCDEF);
+        if (nc & 8) {
+          $for M in reversed(range(MR)):
+            $if M % 2 == 1:
+              vst1_s8(c${M}, vget_high_s8(vout${M-1}x01234567_${M}x01234567)); c${M} += 8;
+              vst1_s8(c${M-1}, vget_low_s8(vout${M-1}x01234567_${M}x01234567)); c${M-1} += 8;
+            $elif M + 1 == MR:
+              vst1_s8(c${M}, vout${M}x01234567); c${M} += 8;
+          $for M in reversed(range(MR)):
+            $if M % 2 == 1:
+              vout${M-1}x01234567_${M}x01234567 = vcombine_s8(vget_high_s8(vout${M-1}x0123456789ABCDEF), vget_high_s8(vout${M}x0123456789ABCDEF));
+            $elif M + 1 == MR:
+              vout${M}x01234567 = vget_high_s8(vout${M}x0123456789ABCDEF);
+        }
+      if (nc & 4) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 2); c${M} += 4;
+            vst1q_lane_u32(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u32_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 4;
+          $elif M + 1 == MR:
+            vst1_lane_u32(__builtin_assume_aligned(c${M}, 1), vreinterpret_u32_s8(vout${M}x01234567), 0); c${M} += 4;
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 4);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 4);
+      }
+      if (nc & 2) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 4); c${M} += 2;
+            vst1q_lane_u16(__builtin_assume_aligned(c${M-1}, 1), vreinterpretq_u16_s8(vout${M-1}x01234567_${M}x01234567), 0); c${M-1} += 2;
+          $elif M + 1 == MR:
+            vst1_lane_u16(__builtin_assume_aligned(c${M}, 1), vreinterpret_u16_s8(vout${M}x01234567), 0); c${M} += 2;
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vout${M-1}x01234567_${M}x01234567 = vextq_s8(vout${M-1}x01234567_${M}x01234567, vout${M-1}x01234567_${M}x01234567, 2);
+          $elif M + 1 == MR:
+            vout${M}x01234567 = vext_s8(vout${M}x01234567, vout${M}x01234567, 2);
+      }
+      if (nc & 1) {
+        $for M in reversed(range(MR)):
+          $if M % 2 == 1:
+            vst1q_lane_s8(c${M}, vout${M-1}x01234567_${M}x01234567, 8);
+            vst1q_lane_s8(c${M-1}, vout${M-1}x01234567_${M}x01234567, 0);
+          $elif M + 1 == MR:
+            vst1_lane_s8(c${M}, vout${M}x01234567, 0);
+      }
+
+      nc = 0;  // a partial column tile is always the last one
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..21f0a45
--- /dev/null
+++ b/src/qs8-igemm/gen/1x16c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,340 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ a += 1;
+
+ size_t k = kc;
+
+ while (k >= 16 * sizeof(int8_t)) {
+ const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+
+ const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+
+ k -= 16 * sizeof(int8_t);
+ }
+
+ if (k >= 8 * sizeof(int8_t)) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+
+ k -= 8 * sizeof(int8_t);
+ }
+
+ if XNN_UNLIKELY(k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+
+ if (k > 2 * sizeof(int8_t)) {
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+
+ if (k > 4 * sizeof(int8_t)) {
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+
+ if (k > 6 * sizeof(int8_t)) {
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ }
+ }
+ }
+ }
+ p -= 1 * sizeof(void*);
+ } while (p != 0);
+
+ const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->neon.multiplier);
+ vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+ vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+ vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+ vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+
+ const int32x4_t vright_shift = vld1q_dup_s32(¶ms->neon.right_shift);
+ const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+ vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+ vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+ vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+ vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+
+ vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+ vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+ vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+ vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+ const int8x16_t voutput_min = vld1q_dup_s8(¶ms->neon.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(¶ms->neon.output_max);
+
+ vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+ vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 16;
+ } else {
+ int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+ if (nc & 8) {
+ vst1_s8(c0, vout0x01234567); c0 += 8;
+ vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+ }
+ if (nc & 4) {
+ vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+ vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+ vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1_lane_s8(c0, vout0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..f384462
--- /dev/null
+++ b/src/qs8-igemm/gen/1x16c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,252 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup(
+ size_t mr,  // rows to compute; asserted to be non-zero and at most 1
+ size_t nc,  // output columns still to produce; consumed 16 per tile
+ size_t kc,  // bytes of int8 input consumed through each indirection pointer
+ size_t ks,  // bytes of indirection pointers walked per tile (multiple of sizeof(void*))
+ const int8_t** restrict a,  // indirection buffer: one input pointer per step of the p loop
+ const void* restrict w,  // packed weights: per tile, 16 int32 values, then int8 data in 8-byte vectors
+ int8_t* restrict c,  // output buffer
+ size_t cm_stride,  // not referenced in this single-row kernel
+ size_t cn_stride,  // bytes added to the output pointer after a full 16-column store
+ size_t a_offset,  // byte offset added to each input pointer that differs from `zero`
+ const int8_t* zero,  // input pointer value that is used as-is, without a_offset
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{  // QS8 indirect GEMM tile, 1 row x 16 columns: int8 multiply-accumulate, requantize, clamp, store.
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+
+ do {  // one pass per tile of up to 16 columns; accumulators start from 16 int32 values read from w
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+ size_t p = ks;  // bytes of indirection pointers left for this tile
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {  // the `zero` pointer is used without the offset
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ a += 1;
+
+ size_t k = kc;
+
+ // Main loop: 8 input bytes (4 pairs) against 16 weight vectors, each 4 columns x 2 k-values.
+ while (k >= 8 * sizeof(int8_t)) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ // vdup_lane_s16 broadcasts input pair cN; vpadalq_s16 adds each adjacent product pair into one int32 lane.
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+
+ k -= 8 * sizeof(int8_t);
+ }
+ // Remainder (k is 1..7): a full 8 bytes are still loaded from a0; pairs 1-3 are gated by k > 2/4/6.
+ if XNN_UNLIKELY(k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+
+ if (k > 2 * sizeof(int8_t)) {
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+
+ if (k > 4 * sizeof(int8_t)) {
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+
+ if (k > 6 * sizeof(int8_t)) {
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ }
+ }
+ }
+ }
+ p -= 1 * sizeof(void*);
+ } while (p != 0);
+ // Requantize: saturating rounding doubling high multiply by the multiplier, then vrshlq_s32 by right_shift.
+ const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+ vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+ vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+ vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+ vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+ // The vsraq step adds -1 to negative accumulators (masked off when right_shift == 0) before the shift.
+ const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+ const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+ vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+ vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+ vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+ vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+
+ vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+ vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+ vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+ vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+ // Saturating narrow to int16, saturating add of the output zero point, saturating narrow to int8.
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+#endif
+ const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+ // Clamp every lane to [output_min, output_max].
+ vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+ vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);  // rewind the indirection pointer for the next tile
+
+ nc -= 16;
+ } else {  // final partial tile: store 8/4/2/1 columns according to the bits of nc
+ int8x8_t vout0x01234567 = vget_low_s8(vout0x0123456789ABCDEF);
+ if (nc & 8) {
+ vst1_s8(c0, vout0x01234567); c0 += 8;
+ vout0x01234567 = vget_high_s8(vout0x0123456789ABCDEF);
+ }
+ if (nc & 4) {
+ vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+ vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+ vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1_lane_s8(c0, vout0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..0f4c1cf
--- /dev/null
+++ b/src/qs8-igemm/gen/1x8c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,237 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup(
+ size_t mr,  // rows to compute; asserted to be non-zero and at most 1
+ size_t nc,  // output columns still to produce; consumed 8 per tile
+ size_t kc,  // bytes of int8 input consumed through each indirection pointer
+ size_t ks,  // bytes of indirection pointers walked per tile (multiple of sizeof(void*))
+ const int8_t** restrict a,  // indirection buffer: one input pointer per step of the p loop
+ const void* restrict w,  // packed weights: per tile, 8 int32 values, then int8 data in 8-byte vectors
+ int8_t* restrict c,  // output buffer
+ size_t cm_stride,  // not referenced in this single-row kernel
+ size_t cn_stride,  // bytes added to the output pointer after a full 8-column store
+ size_t a_offset,  // byte offset added to each input pointer that differs from `zero`
+ const int8_t* zero,  // input pointer value that is used as-is, without a_offset
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{  // QS8 indirect GEMM tile, 1 row x 8 columns: int8 multiply-accumulate, requantize, clamp, store.
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+
+ do {  // one pass per tile of up to 8 columns; accumulators start from 8 int32 values read from w
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+ size_t p = ks;  // bytes of indirection pointers left for this tile
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {  // the `zero` pointer is used without the offset
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ a += 1;
+
+ size_t k = kc;
+ // Main loop: 16 input bytes per pass; products of the two 8-byte halves are summed in int16 first.
+ while (k >= 16 * sizeof(int8_t)) {
+ const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+
+ const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ // NOTE(review): vmull_s8 + vmlal_s8 exceeds int16 only if both products are (-128)*(-128); presumably excluded - confirm.
+ int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+
+ k -= 16 * sizeof(int8_t);
+ }
+ // At most one leftover 8-byte step; each product vector is widened into int32 on its own here.
+ if (k >= 8 * sizeof(int8_t)) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+
+ k -= 8 * sizeof(int8_t);
+ }
+ // Remainder (k is 1..7): a full 8 bytes are still loaded from a0; pairs 1-3 are gated by k > 2/4/6.
+ if XNN_UNLIKELY(k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+
+ if (k > 2 * sizeof(int8_t)) {
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+
+ if (k > 4 * sizeof(int8_t)) {
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+
+ if (k > 6 * sizeof(int8_t)) {
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ }
+ }
+ }
+ }
+ p -= 1 * sizeof(void*);
+ } while (p != 0);
+ // Requantize: saturating rounding doubling high multiply by the multiplier, then vrshlq_s32 by right_shift.
+ const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+ vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+ vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+ // The vsraq step adds -1 to negative accumulators (masked off when right_shift == 0) before the shift.
+ const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+ const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+ vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+ vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+
+ vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+ vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+ // Saturating narrow to int16, saturating add of the output zero point, saturating narrow to int8.
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+
+ int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+
+ int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+#endif
+ const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+ const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+ // Clamp every lane to [output_min, output_max].
+ vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+
+ vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+ if (nc >= 8) {
+ vst1_s8(c0 + 0, vout0x01234567);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);  // rewind the indirection pointer for the next tile
+
+ nc -= 8;
+ } else {  // final partial tile: store 4/2/1 columns according to the bits of nc
+ if (nc & 4) {
+ vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+ vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+ vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1_lane_s8(c0, vout0x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..d8d8367
--- /dev/null
+++ b/src/qs8-igemm/gen/1x8c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,189 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+// QS8 indirect GEMM (IGEMM) min/max microkernel: computes a 1x8 tile of int8
+// outputs per pass over `nc`. `w` holds 8 int32 initial accumulators followed
+// by int8 weights; `a` is an array of input pointers consumed `ks` bytes per
+// tile. `mr` is only asserted and `cm_stride` is unused (single output row).
+void xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+
+  do {
+    // Seed the 8 column accumulators from the next 8 int32 values at `w`.
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      // A pointer equal to `zero` is used as-is; any other gets `a_offset` added.
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      // Main loop: 8 input bytes per pass, handled as four 2-byte groups (c0..c3).
+      size_t k = kc;
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      // Remainder of 1..7 bytes: a full 8-byte vector is still loaded from a0, but
+      // a0 advances by k only; each nesting level below consumes one 2-byte group.
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+
+            if (k > 6 * sizeof(int8_t)) {
+              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+            }
+          }
+        }
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    // Requantize: saturating rounding doubling multiply-high by the multiplier.
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+
+    // Where right_shift != 0, add (acc >> 31), i.e. -1 on negative lanes, before the rounding shift.
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+
+    // Saturating narrow to int16, add the output zero point, then saturating narrow to int8.
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+#endif
+    int8x8_t vout0x01234567 = vqmovn_s16(vacc0x01234567);
+    // Clamp to [output_min, output_max].
+    const int8x8_t voutput_min = vld1_dup_s8(&params->neon.output_min);
+    const int8x8_t voutput_max = vld1_dup_s8(&params->neon.output_max);
+    vout0x01234567 = vmax_s8(vout0x01234567, voutput_min);
+    vout0x01234567 = vmin_s8(vout0x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c0 + 0, vout0x01234567);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      // Rewind `a` by ks bytes so the next 8 columns walk the same pointers again.
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+      nc -= 8;
+    } else {
+      // Final partial tile: store 4, 2 and 1 bytes according to the bits of nc.
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpret_u32_s8(vout0x01234567), 0); c0 += 4;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpret_u16_s8(vout0x01234567), 0); c0 += 2;
+        vout0x01234567 = vext_s8(vout0x01234567, vout0x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c0, vout0x01234567, 0);
+      }
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..8fa5b3c
--- /dev/null
+++ b/src/qs8-igemm/gen/2x16c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,494 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 2);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 2) {
+ c1 = c0;
+ }
+
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc1x0123 = vacc0x0123;
+ int32x4_t vacc1x4567 = vacc0x4567;
+ int32x4_t vacc1x89AB = vacc0x89AB;
+ int32x4_t vacc1xCDEF = vacc0xCDEF;
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const int8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ a += 2;
+
+ size_t k = kc;
+
+ while (k >= 16 * sizeof(int8_t)) {
+ const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+
+ const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+ int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+ int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1x89ABc1 = vmlal_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+ int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+ int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1x89ABc2 = vmlal_s8(vprod1x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+ int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1xCDEFc2 = vmlal_s8(vprod1xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+ int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1x89ABc3 = vmlal_s8(vprod1x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+ int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1xCDEFc3 = vmlal_s8(vprod1xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+
+ k -= 16 * sizeof(int8_t);
+ }
+
+ if (k >= 8 * sizeof(int8_t)) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+ const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+
+ k -= 8 * sizeof(int8_t);
+ }
+
+ if XNN_UNLIKELY(k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+ const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+ const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+
+ if (k > 2 * sizeof(int8_t)) {
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+ const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+
+ if (k > 4 * sizeof(int8_t)) {
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+ const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+
+ if (k > 6 * sizeof(int8_t)) {
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+ const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+ }
+ }
+ }
+ }
+ p -= 2 * sizeof(void*);
+ } while (p != 0);
+
+ const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->neon.multiplier);
+ vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+ vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+ vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+ vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+ vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+ vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+ vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+ vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+
+ const int32x4_t vright_shift = vld1q_dup_s32(¶ms->neon.right_shift);
+ const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+ vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+ vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+ vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+ vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+ vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+ vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+ vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+ vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+
+ vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+ vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+ vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+ vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+ vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+ vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+ vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+ vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+ int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+ int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+#endif
+ const int8x16_t voutput_min = vld1q_dup_s8(¶ms->neon.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(¶ms->neon.output_max);
+
+ vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+ vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+ vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+ vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+ vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 16;
+ } else {
+ int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+ if (nc & 8) {
+ vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+ vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+ vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+ }
+ if (nc & 4) {
+ vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..867c6c0
--- /dev/null
+++ b/src/qs8-igemm/gen/2x16c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,356 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup(  // int8 kernel: per tile, up to 2 output rows x 16 columns; input rows are reached through the pointer array `a`.
+    size_t mr,  // Number of output rows to produce; asserted to be 1 or 2.
+    size_t nc,  // Output columns still to produce; asserted non-zero.
+    size_t kc,  // Bytes of K consumed from each input row pointer; asserted non-zero.
+    size_t ks,  // Bytes of pointer entries in `a` consumed per output tile; asserted to be a non-zero multiple of 2 * sizeof(void*).
+    const int8_t** restrict a,  // Input row pointers, read two at a time (row 0, row 1).
+    const void* restrict w,  // Packed stream: 16 int32 initial accumulator values, then int8 data in 8-byte groups, repeated per tile.
+    int8_t* restrict c,  // Output pointer for row 0.
+    size_t cm_stride,  // Byte offset from output row 0 to output row 1.
+    size_t cn_stride,  // Byte step applied to the output pointers after a full 16-column tile.
+    size_t a_offset,  // Byte offset added to every input row pointer that is not equal to `zero`.
+    const int8_t* zero,  // Row pointer value that is used as-is, without `a_offset`.
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // Supplies neon.multiplier, neon.right_shift, neon.output_zero_point, neon.output_min, neon.output_max.
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {  // Single-row call: alias row 1 onto row 0. Row 1 is always stored first below, so row 0's result is what remains.
+    c1 = c0;
+  }
+
+  do {  // One iteration per tile of up to 16 output columns.
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  // Accumulators start from the 16 int32 values at the head of the packed stream.
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;  // Row 1 starts from the same initial values as row 0.
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc1x89AB = vacc0x89AB;
+    int32x4_t vacc1xCDEF = vacc0xCDEF;
+
+    size_t p = ks;  // Bytes of pointer entries left to consume for this tile.
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {  // The `zero` pointer is used unshifted; every other row pointer gets a_offset added.
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+
+
+      while (k >= 8 * sizeof(int8_t)) {  // Main loop: 8 bytes of K per row and 16 x 8 bytes of packed data per iteration.
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));  // 16-bit lane i of va0 is the byte pair (2i, 2i+1); it is broadcast to all four lanes before the widening multiply.
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);  // Pairwise-add adjacent int16 products and accumulate into the int32 lanes.
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));  // Row 1 reuses the packed vectors already loaded for row 0.
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+        const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {  // Remainder: 1..7 bytes of K are left for this pointer pair.
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);  // Still an 8-byte load; only the pointer advance is limited to k bytes.
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+        const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+        const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+
+        if (k > 2 * sizeof(int8_t)) {  // More than 2 bytes left: consume the second byte pair and its 32 bytes of packed data.
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+          const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+          const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+          const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+
+          if (k > 4 * sizeof(int8_t)) {  // More than 4 bytes left: consume the third byte pair.
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+            const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+            const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+            const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+
+            if (k > 6 * sizeof(int8_t)) {  // More than 6 bytes left: consume the fourth byte pair.
+              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+              const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+              vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+              const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+              vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+              const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+              vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+              const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+              vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+            }
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);  // One pointer pair (row 0, row 1) has been consumed.
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);  // Requantization step 1: saturating rounding doubling multiply-high by the multiplier.
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+    vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+    vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));  // All-ones when right_shift == 0, otherwise all-zeros.
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);  // Step 2: subtract 1 from negative accumulators, skipped entirely when right_shift == 0.
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+    vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+    vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);  // Step 3: rounding shift by the count in vright_shift (vrshlq shifts right for negative counts).
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+    vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+    vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);  // Step 4: saturating narrow to int16, saturating add of the zero point, saturating narrow to int8.
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+    int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+
+    int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+    int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);  // Step 5: clamp to [output_min, output_max].
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+    vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+    vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+    vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+    if (nc >= 16) {  // Full tile: store 16 bytes per row, row 1 before row 0.
+      vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+      vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);  // Rewind the pointer array so the next tile replays the same input rows.
+
+      nc -= 16;
+    } else {  // Partial tile: emit 8, 4, 2 and 1 byte pieces according to the bits of nc.
+      int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));  // Low half holds row 0, high half holds row 1.
+      if (nc & 8) {
+        vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+        vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+        vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));  // Switch to columns 8..15 of both rows.
+      }
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);  // Rotate by 4 bytes so the next unwritten columns sit at byte 0 (row 0) and byte 8 (row 1).
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;  // The partial tile is always the last one.
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..478e2c5
--- /dev/null
+++ b/src/qs8-igemm/gen/2x8c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,320 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup(  // int8 indirect-GEMM micro-kernel: up to 2 rows x 8 columns of clamped output per tile
+    size_t mr,  // rows to produce; asserted to be 1 or 2
+    size_t nc,  // output columns still to produce; 8 are consumed per tile
+    size_t kc,  // bytes of K read from each input row per indirection step
+    size_t ks,  // bytes of indirection pointers consumed per tile; multiple of 2 * sizeof(void*)
+    const int8_t** restrict a,  // indirection buffer; two row pointers are read per step
+    const void* restrict w,  // packed weights: per tile, 8 int32 initial accumulators followed by int8 weights
+    int8_t* restrict c,  // start of output row 0
+    size_t cm_stride,  // byte distance from output row 0 to output row 1
+    size_t cn_stride,  // byte distance added to the output pointers after each full tile
+    size_t a_offset,  // byte offset added to every row pointer that differs from `zero`
+    const int8_t* zero,  // row pointer value that is used as-is, without a_offset
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // requantization and clamping constants (params->neon)
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;  // mr == 1: row-1 stores alias row 0; row 0 is stored after row 1, so its values remain
+  }
+
+  do {  // one pass per tile of up to 8 output columns
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+
+    size_t p = ks;
+    do {  // one pass per pair of row pointers in the indirection buffer
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+      // 16 bytes of K per pass. vdup_lane_s16 broadcasts one pair of K bytes; vmull_s8 + vmlal_s8 sum two products per int16 lane; vpadalq_s16 pair-adds into int32.
+      while (k >= 16 * sizeof(int8_t)) {
+        const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        // NOTE(review): the int16 sum of two int8 products presumably relies on operand ranges that cannot overflow int16 - confirm.
+        int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+        int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+        const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+        vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+        int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+        const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+        vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+        int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+        const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+        vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+        int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+        const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+        vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+
+        k -= 16 * sizeof(int8_t);
+      }
+      // At most one further 8-byte step, using vmull_s8 alone (no int16 pre-accumulation).
+      if (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+      // Tail of 1..7 bytes: 8 bytes are still loaded per row, but the row pointers advance by k only; weight groups are consumed for k > 0, 2, 4, 6.
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+
+            if (k > 6 * sizeof(int8_t)) {
+              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+            }
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+    // Requantize: saturating rounding doubling high-half multiply of every accumulator by the multiplier.
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    // When right_shift is non-zero, negative lanes get -1 added (acc >> 31) ahead of the rounding shift below; with a zero shift the mask cancels the adjustment.
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    // NOTE(review): vrshlq_s32 shifts by a signed per-lane amount; right_shift is presumably stored non-positive so this is a rounding right shift - confirm.
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    // Saturating narrow to int16, saturating add of the output zero point, saturating narrow to int8; row 0 fills bytes 0-7 and row 1 bytes 8-15.
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {  // full tile: store 8 columns for each row
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);  // rewind the indirection buffer for the next tile
+
+      nc -= 8;
+    } else {  // partial tile: store 4, 2, 1 columns per set bit of nc, rotating pending bytes to lanes 0 (row 0) and 8 (row 1)
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..03fa52d
--- /dev/null
+++ b/src/qs8-igemm/gen/2x8c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,246 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup(  // int8 indirect-GEMM micro-kernel: up to 2 rows x 8 columns of clamped output per tile
+    size_t mr,  // rows to produce; asserted to be 1 or 2
+    size_t nc,  // output columns still to produce; 8 are consumed per tile
+    size_t kc,  // bytes of K read from each input row per indirection step
+    size_t ks,  // bytes of indirection pointers consumed per tile; multiple of 2 * sizeof(void*)
+    const int8_t** restrict a,  // indirection buffer; two row pointers are read per step
+    const void* restrict w,  // packed weights: per tile, 8 int32 initial accumulators followed by int8 weights
+    int8_t* restrict c,  // start of output row 0
+    size_t cm_stride,  // byte distance from output row 0 to output row 1
+    size_t cn_stride,  // byte distance added to the output pointers after each full tile
+    size_t a_offset,  // byte offset added to every row pointer that differs from `zero`
+    const int8_t* zero,  // row pointer value that is used as-is, without a_offset
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // requantization and clamping constants (params->neon)
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;  // mr == 1: row-1 stores alias row 0; row 0 is stored after row 1, so its values remain
+  }
+
+  do {  // one pass per tile of up to 8 output columns
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+
+    size_t p = ks;
+    do {  // one pass per pair of row pointers in the indirection buffer
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = kc;
+      // 8 bytes of K per pass. vdup_lane_s16 broadcasts one pair of K bytes against an 8-byte weight group;
+      // vmull_s8 forms eight int16 products and vpadalq_s16 pair-adds them into four int32 accumulators.
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+      // Tail of 1..7 bytes: 8 bytes are still loaded per row, but the row pointers advance by k only; weight groups are consumed for k > 0, 2, 4, 6.
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+
+            if (k > 6 * sizeof(int8_t)) {
+              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+            }
+          }
+        }
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+    // Requantize: saturating rounding doubling high-half multiply of every accumulator by the multiplier.
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    // When right_shift is non-zero, negative lanes get -1 added (acc >> 31) ahead of the rounding shift below; with a zero shift the mask cancels the adjustment.
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    // NOTE(review): vrshlq_s32 shifts by a signed per-lane amount; right_shift is presumably stored non-positive so this is a rounding right shift - confirm.
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    // Saturating narrow to int16, saturating add of the output zero point, saturating narrow to int8; row 0 fills bytes 0-7 and row 1 bytes 8-15.
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {  // full tile: store 8 columns for each row
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);  // rewind the indirection buffer for the next tile
+
+      nc -= 8;
+    } else {  // partial tile: store 4, 2, 1 columns per set bit of nc, rotating pending bytes to lanes 0 (row 0) and 8 (row 1)
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..6d304e2
--- /dev/null
+++ b/src/qs8-igemm/gen/3x16c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,652 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 3);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (3 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc1x0123 = vacc0x0123;
+ int32x4_t vacc1x4567 = vacc0x4567;
+ int32x4_t vacc1x89AB = vacc0x89AB;
+ int32x4_t vacc1xCDEF = vacc0xCDEF;
+ int32x4_t vacc2x0123 = vacc0x0123;
+ int32x4_t vacc2x4567 = vacc0x4567;
+ int32x4_t vacc2x89AB = vacc0x89AB;
+ int32x4_t vacc2xCDEF = vacc0xCDEF;
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const int8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ const int8_t* restrict a2 = a[2];
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+ }
+ a += 3;
+
+ size_t k = kc;
+
+ while (k >= 16 * sizeof(int8_t)) {
+ const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+ const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+
+ const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+ const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+ const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+ const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+ int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+ const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+ int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+ const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+ const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+ const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1x89ABc1 = vmlal_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+ int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+ const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+ int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+ const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+ const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+ const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1x89ABc2 = vmlal_s8(vprod1x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vprod2x89ABc2 = vmlal_s8(vprod2x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+ int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+ const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1xCDEFc2 = vmlal_s8(vprod1xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vprod2xCDEFc2 = vmlal_s8(vprod2xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+ int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+ const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+ const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+ int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+ const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1x89ABc3 = vmlal_s8(vprod1x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vprod2x89ABc3 = vmlal_s8(vprod2x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+ int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+ const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1xCDEFc3 = vmlal_s8(vprod1xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vprod2xCDEFc3 = vmlal_s8(vprod2xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+
+ k -= 16 * sizeof(int8_t);
+ }
+
+ if (k >= 8 * sizeof(int8_t)) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+ const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+ const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+ const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+ const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+
+ k -= 8 * sizeof(int8_t);
+ }
+
+ if XNN_UNLIKELY(k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+ const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+ const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+ const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+ const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+ const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+
+ if (k > 2 * sizeof(int8_t)) {
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+ const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+ const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+ const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+
+ if (k > 4 * sizeof(int8_t)) {
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+ const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+ const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+ const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+
+ if (k > 6 * sizeof(int8_t)) {
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+ const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+ const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+ const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+ const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+ }
+ }
+ }
+ }
+ p -= 3 * sizeof(void*);
+ } while (p != 0);
+
+ const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+ vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+ vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+ vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+ vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+ vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+ vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+ vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+ vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+ vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+ vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+ vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+ vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+ const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+ const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+ vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+ vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+ vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+ vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+ vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+ vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+ vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+ vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+ vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+ vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+ vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+ vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+ vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+ vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+ vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+ vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+ vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+ vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+ vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+ vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+ vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+ vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+ vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+ vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+ const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+ int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+ int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+ const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+ int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+ int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+ const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+ vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+ vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+ vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+ vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+ vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+ vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+ vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+ vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 16;
+ } else {
+ int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+ int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+ if (nc & 8) {
+ vst1_s8(c2, vout2x01234567); c2 += 8;
+ vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+ vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+ vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+ vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+ }
+ if (nc & 4) {
+ vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1_lane_s8(c2, vout2x01234567, 0);
+ vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..53161f3
--- /dev/null
+++ b/src/qs8-igemm/gen/3x16c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,464 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 3);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (3 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc1x0123 = vacc0x0123;
+ int32x4_t vacc1x4567 = vacc0x4567;
+ int32x4_t vacc1x89AB = vacc0x89AB;
+ int32x4_t vacc1xCDEF = vacc0xCDEF;
+ int32x4_t vacc2x0123 = vacc0x0123;
+ int32x4_t vacc2x4567 = vacc0x4567;
+ int32x4_t vacc2x89AB = vacc0x89AB;
+ int32x4_t vacc2xCDEF = vacc0xCDEF;
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const int8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ const int8_t* restrict a2 = a[2];
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+ }
+ a += 3;
+
+ size_t k = kc;
+
+
+ while (k >= 8 * sizeof(int8_t)) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+ const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+ const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+ const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+ const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+
+ k -= 8 * sizeof(int8_t);
+ }
+
+ if XNN_UNLIKELY(k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+ const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+ const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+ const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+ const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+ const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+
+ if (k > 2 * sizeof(int8_t)) {
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+ const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+ const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+ const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+
+ if (k > 4 * sizeof(int8_t)) {
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+ const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+ const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+ const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+
+ if (k > 6 * sizeof(int8_t)) {
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+ const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+ const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+ const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+ const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+ }
+ }
+ }
+ }
+ p -= 3 * sizeof(void*);
+ } while (p != 0);
+
+ const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+ vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+ vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+ vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+ vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+ vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+ vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+ vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+ vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+ vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+ vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+ vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+ vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+
+ const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+ const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+ vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+ vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+ vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+ vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+ vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+ vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+ vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+ vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+ vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+ vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+ vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+ vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+
+ vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+ vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+ vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+ vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+ vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+ vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+ vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+ vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+ vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+ vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+ vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+ vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+ const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+ int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+ int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+ const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+ int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+ int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+#endif
+ const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+ vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+ vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+ vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+ vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+ vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+ vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+ vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+ vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 16;
+ } else {
+ int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+ int8x8_t vout2x01234567 = vget_low_s8(vout2x0123456789ABCDEF);
+ if (nc & 8) {
+ vst1_s8(c2, vout2x01234567); c2 += 8;
+ vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+ vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+ vout2x01234567 = vget_high_s8(vout2x0123456789ABCDEF);
+ vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+ }
+ if (nc & 4) {
+ vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1_lane_s8(c2, vout2x01234567, 0);
+ vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..fac3944
--- /dev/null
+++ b/src/qs8-igemm/gen/3x8c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,409 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 3);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (3 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc1x0123 = vacc0x0123;
+ int32x4_t vacc1x4567 = vacc0x4567;
+ int32x4_t vacc2x0123 = vacc0x0123;
+ int32x4_t vacc2x4567 = vacc0x4567;
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const int8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ const int8_t* restrict a2 = a[2];
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+ }
+ a += 3;
+
+ size_t k = kc;
+
+ while (k >= 16 * sizeof(int8_t)) {
+ const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+ const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+
+ const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+ const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+ const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+ const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+ const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+ const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+ const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+ const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+ const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+
+ k -= 16 * sizeof(int8_t);
+ }
+
+ if (k >= 8 * sizeof(int8_t)) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+
+ k -= 8 * sizeof(int8_t);
+ }
+
+ if XNN_UNLIKELY(k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+ const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+ const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+
+ if (k > 2 * sizeof(int8_t)) {
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+
+ if (k > 4 * sizeof(int8_t)) {
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+
+ if (k > 6 * sizeof(int8_t)) {
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+ }
+ }
+ }
+ }
+ p -= 3 * sizeof(void*);
+ } while (p != 0);
+
+ const int32x4_t vmultiplier = vld1q_dup_s32(¶ms->neon.multiplier);
+ vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+ vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+ vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+ vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+ vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+ vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+
+ const int32x4_t vright_shift = vld1q_dup_s32(¶ms->neon.right_shift);
+ const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+ vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+ vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+ vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+ vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+ vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+ vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+
+ vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+ vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+ vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+ vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+ vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+ vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(¶ms->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+
+ int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+ int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+ int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+ int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+ const int8x16_t voutput_min = vld1q_dup_s8(¶ms->neon.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(¶ms->neon.output_max);
+
+ vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+ vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+ vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+ vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+ if (nc >= 8) {
+ vst1_s8(c2 + 0, vout2x01234567);
+ vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+ vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+ c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1_lane_s8(c2, vout2x01234567, 0);
+ vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..3f7f6b3
--- /dev/null
+++ b/src/qs8-igemm/gen/3x8c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,309 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup(  // int8 micro-kernel: up to 3 output rows x 8 columns per pass, rows read through the pointer array a
+    size_t mr,  // rows to compute; asserted to be in 1..3
+    size_t nc,  // output columns still to produce; consumed 8 at a time
+    size_t kc,  // bytes read from each row pointer per indirection step
+    size_t ks,  // bytes of the pointer array a consumed per column tile; multiple of 3 * sizeof(void*)
+    const int8_t** restrict a,  // array of row pointers, taken three at a time
+    const void* restrict w,  // packed stream: 8 int32 accumulator seeds per tile, then int8 weights
+    int8_t* restrict c,  // output, row 0
+    size_t cm_stride,  // byte distance between consecutive output rows
+    size_t cn_stride,  // byte advance of each output row pointer after a full 8-column tile
+    size_t a_offset,  // byte offset added to every row pointer that differs from zero
+    const int8_t* zero,  // row pointer value that is used as-is, without a_offset
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // requantization and clamp constants (params->neon)
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+  // Per-row output pointers; a row index at or beyond mr aliases the previous row's pointer.
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  // Outer loop: one iteration per tile of up to 8 output columns.
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    // Walk ks bytes of the pointer array, three row pointers per step.
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = kc;
+
+      // Main loop: 8 bytes of every row per pass, handled as four 2-byte groups (c0..c3).
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        // Broadcast one 2-byte pair of a row (vdup_lane_s16), widen-multiply by 8 weights, pairwise-add into 4 int32 lanes.
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+      // Remainder (0 < k < 8): rows are still loaded 8 bytes wide but the row pointers advance by only k.
+      if XNN_UNLIKELY(k != 0) {
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        // Group c0 always runs; groups c1, c2, c3 run only when k exceeds 2, 4, 6 bytes respectively.
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+          const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+          const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+            const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+            const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+
+            if (k > 6 * sizeof(int8_t)) {
+              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+              const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+              vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+              const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+              vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+            }
+          }
+        }
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+    // Requantize: saturating rounding doubling high-half multiply by params->neon.multiplier.
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    // Add (acc >> 31), i.e. -1 for negative lanes, ahead of the rounding shift; masked to 0 when right_shift == 0.
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    // Rounding shift by right_shift (vrshlq_s32); NOTE(review): presumably a non-positive count, i.e. a right shift - confirm.
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    // Saturating narrow to int16, saturating add of the output zero point, then saturating narrow to int8.
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x8_t vout2x01234567 = vqmovn_s16(vacc2x01234567);
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+    // Clamp every lane to [output_min, output_max]; rows 0 and 1 share one 16-byte vector.
+    vout2x01234567 = vmax_s8(vout2x01234567, vget_low_s8(voutput_min));
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567 = vmin_s8(vout2x01234567, vget_low_s8(voutput_max));
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+    // Full tile: store 8 columns per row, advance the outputs by cn_stride and rewind a by ks for the next tile.
+    if (nc >= 8) {
+      vst1_s8(c2 + 0, vout2x01234567);
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {  // Partial tile: store 4, then 2, then 1 columns according to the bits of nc, rotating the vectors in between.
+      if (nc & 4) {
+        vst1_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpret_u32_s8(vout2x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 4);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpret_u16_s8(vout2x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567 = vext_s8(vout2x01234567, vout2x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1_lane_s8(c2, vout2x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..f6d24d2
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,806 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc1x0123 = vacc0x0123;
+ int32x4_t vacc1x4567 = vacc0x4567;
+ int32x4_t vacc1x89AB = vacc0x89AB;
+ int32x4_t vacc1xCDEF = vacc0xCDEF;
+ int32x4_t vacc2x0123 = vacc0x0123;
+ int32x4_t vacc2x4567 = vacc0x4567;
+ int32x4_t vacc2x89AB = vacc0x89AB;
+ int32x4_t vacc2xCDEF = vacc0xCDEF;
+ int32x4_t vacc3x0123 = vacc0x0123;
+ int32x4_t vacc3x4567 = vacc0x4567;
+ int32x4_t vacc3x89AB = vacc0x89AB;
+ int32x4_t vacc3xCDEF = vacc0xCDEF;
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const int8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ const int8_t* restrict a2 = a[2];
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+ }
+ const int8_t* restrict a3 = a[3];
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+
+ while (k >= 16 * sizeof(int8_t)) {
+ const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+ const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+ const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;
+ const int8x8_t va3x1 = vld1_s8(a3); a3 += 8;
+
+ const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+ int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+ const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+ vprod3x0123c0 = vmlal_s8(vprod3x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+ int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+ int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+ const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+ vprod3x4567c0 = vmlal_s8(vprod3x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+ int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+ int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+ const int8x8_t vb89ABc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc0 = vmlal_s8(vprod0x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1x89ABc0 = vmlal_s8(vprod1x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vprod2x89ABc0 = vmlal_s8(vprod2x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+ vprod3x89ABc0 = vmlal_s8(vprod3x89ABc0, vb89ABc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+ int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+ int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+ const int8x8_t vbCDEFc0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc0 = vmlal_s8(vprod0xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1xCDEFc0 = vmlal_s8(vprod1xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vprod2xCDEFc0 = vmlal_s8(vprod2xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+ vprod3xCDEFc0 = vmlal_s8(vprod3xCDEFc0, vbCDEFc0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+ int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+ int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+ const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+ vprod3x0123c1 = vmlal_s8(vprod3x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+ int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+ int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+ const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+ vprod3x4567c1 = vmlal_s8(vprod3x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+ int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+ int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+ const int8x8_t vb89ABc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc1 = vmlal_s8(vprod0x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1x89ABc1 = vmlal_s8(vprod1x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vprod2x89ABc1 = vmlal_s8(vprod2x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+ vprod3x89ABc1 = vmlal_s8(vprod3x89ABc1, vb89ABc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+ int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+ int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+ const int8x8_t vbCDEFc1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc1 = vmlal_s8(vprod0xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1xCDEFc1 = vmlal_s8(vprod1xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vprod2xCDEFc1 = vmlal_s8(vprod2xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+ vprod3xCDEFc1 = vmlal_s8(vprod3xCDEFc1, vbCDEFc1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+ int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+ int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+ const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+ vprod3x0123c2 = vmlal_s8(vprod3x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+ int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+ int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+ const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+ vprod3x4567c2 = vmlal_s8(vprod3x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+ int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+ int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+ const int8x8_t vb89ABc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc2 = vmlal_s8(vprod0x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1x89ABc2 = vmlal_s8(vprod1x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vprod2x89ABc2 = vmlal_s8(vprod2x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+ vprod3x89ABc2 = vmlal_s8(vprod3x89ABc2, vb89ABc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+ int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+ int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+ const int8x8_t vbCDEFc2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc2 = vmlal_s8(vprod0xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1xCDEFc2 = vmlal_s8(vprod1xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vprod2xCDEFc2 = vmlal_s8(vprod2xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+ vprod3xCDEFc2 = vmlal_s8(vprod3xCDEFc2, vbCDEFc2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+ int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+ int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+ const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+ vprod3x0123c3 = vmlal_s8(vprod3x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+ int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+ int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+ const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+ vprod3x4567c3 = vmlal_s8(vprod3x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+ int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+ int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+ const int8x8_t vb89ABc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x89ABc3 = vmlal_s8(vprod0x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1x89ABc3 = vmlal_s8(vprod1x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vprod2x89ABc3 = vmlal_s8(vprod2x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+ vprod3x89ABc3 = vmlal_s8(vprod3x89ABc3, vb89ABc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
+ int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+ int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+ const int8x8_t vbCDEFc3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0xCDEFc3 = vmlal_s8(vprod0xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1xCDEFc3 = vmlal_s8(vprod1xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vprod2xCDEFc3 = vmlal_s8(vprod2xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+ vprod3xCDEFc3 = vmlal_s8(vprod3xCDEFc3, vbCDEFc3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
+
+ k -= 16 * sizeof(int8_t);
+ }
+
+ if (k >= 8 * sizeof(int8_t)) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+ const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+ const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+ const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+ const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+ const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+ const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+ const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+ const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ const int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
+ const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ const int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
+
+ k -= 8 * sizeof(int8_t);
+ }
+
+ if XNN_UNLIKELY(k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+ const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+ const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+ const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+ const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+ const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+ const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+ const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+ const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+ const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+ const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+
+ if (k > 2 * sizeof(int8_t)) {
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+ const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+ const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+ const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+ const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+ const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+ const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+ const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+
+ if (k > 4 * sizeof(int8_t)) {
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+ const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+ const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+ const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+ const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+ const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+ const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+ const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+
+ if (k > 6 * sizeof(int8_t)) {
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+ const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+ const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+ const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+ const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+ const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+ const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+ const int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
+ const int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
+ }
+ }
+ }
+ }
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+ vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+ vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+ vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+ vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+ vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+ vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+ vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+ vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+ vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+ vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+ vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+ vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+ vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+ vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+ vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+ vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+ const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+ const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+ vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+ vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+ vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+ vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+ vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+ vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+ vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+ vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+ vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+ vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+ vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+ vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+ vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+ vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+ vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+ vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+ vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+ vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+ vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+ vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+ vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+ vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+ vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+ vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+ vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+ vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+ vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+ vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+ vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+ vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+ vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+ vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+ const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+ const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+ const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+ int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+ int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+ int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+ const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+ const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+ const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+ int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+ int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+ int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+ const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+ vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+ vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+ vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+ vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+ vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+ vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+ vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+ vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+ vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+ vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+ vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+ c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 16;
+ } else {
+ int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+ int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+ if (nc & 8) {
+ vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+ vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+ vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+ vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+ vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+ vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+ }
+ if (nc & 4) {
+ vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+ vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+ vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..d7a469b
--- /dev/null
+++ b/src/qs8-igemm/gen/4x16c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,568 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x89AB = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0xCDEF = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc1x0123 = vacc0x0123;
+ int32x4_t vacc1x4567 = vacc0x4567;
+ int32x4_t vacc1x89AB = vacc0x89AB;
+ int32x4_t vacc1xCDEF = vacc0xCDEF;
+ int32x4_t vacc2x0123 = vacc0x0123;
+ int32x4_t vacc2x4567 = vacc0x4567;
+ int32x4_t vacc2x89AB = vacc0x89AB;
+ int32x4_t vacc2xCDEF = vacc0xCDEF;
+ int32x4_t vacc3x0123 = vacc0x0123;
+ int32x4_t vacc3x4567 = vacc0x4567;
+ int32x4_t vacc3x89AB = vacc0x89AB;
+ int32x4_t vacc3xCDEF = vacc0xCDEF;
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const int8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ const int8_t* restrict a2 = a[2];
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+ }
+ const int8_t* restrict a3 = a[3];
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+
+
+ while (k >= 8 * sizeof(int8_t)) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+ const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+ const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+ const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+ const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+ const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+ const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+ const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+ const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ const int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
+ const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ const int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
+
+ k -= 8 * sizeof(int8_t);
+ }
+
+ if XNN_UNLIKELY(k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+ const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+ const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+ const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ const int16x8_t vprod0x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc0);
+ const int16x8_t vprod0xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc0);
+ const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ const int16x8_t vprod1x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc0);
+ const int16x8_t vprod1xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc0);
+ const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ const int16x8_t vprod2x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc0);
+ const int16x8_t vprod2xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc0);
+ const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+ const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+ const int16x8_t vprod3x89ABc0 = vmull_s8(vb89ABc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc0);
+ const int16x8_t vprod3xCDEFc0 = vmull_s8(vbCDEFc0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc0);
+
+ if (k > 2 * sizeof(int8_t)) {
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ const int16x8_t vprod0x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc1);
+ const int16x8_t vprod0xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc1);
+ const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ const int16x8_t vprod1x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc1);
+ const int16x8_t vprod1xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc1);
+ const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ const int16x8_t vprod2x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc1);
+ const int16x8_t vprod2xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc1);
+ const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+ const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+ const int16x8_t vprod3x89ABc1 = vmull_s8(vb89ABc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc1);
+ const int16x8_t vprod3xCDEFc1 = vmull_s8(vbCDEFc1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc1);
+
+ if (k > 4 * sizeof(int8_t)) {
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ const int16x8_t vprod0x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc2);
+ const int16x8_t vprod0xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc2);
+ const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ const int16x8_t vprod1x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc2);
+ const int16x8_t vprod1xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc2);
+ const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ const int16x8_t vprod2x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc2);
+ const int16x8_t vprod2xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc2);
+ const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+ const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+ const int16x8_t vprod3x89ABc2 = vmull_s8(vb89ABc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc2);
+ const int16x8_t vprod3xCDEFc2 = vmull_s8(vbCDEFc2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc2);
+
+ if (k > 6 * sizeof(int8_t)) {
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb89ABc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vbCDEFc3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod0x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x89AB = vpadalq_s16(vacc0x89AB, vprod0x89ABc3);
+ const int16x8_t vprod0xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0xCDEF = vpadalq_s16(vacc0xCDEF, vprod0xCDEFc3);
+ const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ const int16x8_t vprod1x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x89AB = vpadalq_s16(vacc1x89AB, vprod1x89ABc3);
+ const int16x8_t vprod1xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1xCDEF = vpadalq_s16(vacc1xCDEF, vprod1xCDEFc3);
+ const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+ const int16x8_t vprod2x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x89AB = vpadalq_s16(vacc2x89AB, vprod2x89ABc3);
+ const int16x8_t vprod2xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2xCDEF = vpadalq_s16(vacc2xCDEF, vprod2xCDEFc3);
+ const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+ const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+ const int16x8_t vprod3x89ABc3 = vmull_s8(vb89ABc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x89AB = vpadalq_s16(vacc3x89AB, vprod3x89ABc3);
+ const int16x8_t vprod3xCDEFc3 = vmull_s8(vbCDEFc3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3xCDEF = vpadalq_s16(vacc3xCDEF, vprod3xCDEFc3);
+ }
+ }
+ }
+ }
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+ vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+ vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+ vacc0x89AB = vqrdmulhq_s32(vacc0x89AB, vmultiplier);
+ vacc0xCDEF = vqrdmulhq_s32(vacc0xCDEF, vmultiplier);
+ vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+ vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+ vacc1x89AB = vqrdmulhq_s32(vacc1x89AB, vmultiplier);
+ vacc1xCDEF = vqrdmulhq_s32(vacc1xCDEF, vmultiplier);
+ vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+ vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+ vacc2x89AB = vqrdmulhq_s32(vacc2x89AB, vmultiplier);
+ vacc2xCDEF = vqrdmulhq_s32(vacc2xCDEF, vmultiplier);
+ vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+ vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+ vacc3x89AB = vqrdmulhq_s32(vacc3x89AB, vmultiplier);
+ vacc3xCDEF = vqrdmulhq_s32(vacc3xCDEF, vmultiplier);
+
+ const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+ const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+ vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+ vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+ vacc0x89AB = vsraq_n_s32(vacc0x89AB, vbicq_s32(vacc0x89AB, vzero_shift_mask), 31);
+ vacc0xCDEF = vsraq_n_s32(vacc0xCDEF, vbicq_s32(vacc0xCDEF, vzero_shift_mask), 31);
+ vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+ vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+ vacc1x89AB = vsraq_n_s32(vacc1x89AB, vbicq_s32(vacc1x89AB, vzero_shift_mask), 31);
+ vacc1xCDEF = vsraq_n_s32(vacc1xCDEF, vbicq_s32(vacc1xCDEF, vzero_shift_mask), 31);
+ vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+ vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+ vacc2x89AB = vsraq_n_s32(vacc2x89AB, vbicq_s32(vacc2x89AB, vzero_shift_mask), 31);
+ vacc2xCDEF = vsraq_n_s32(vacc2xCDEF, vbicq_s32(vacc2xCDEF, vzero_shift_mask), 31);
+ vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+ vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+ vacc3x89AB = vsraq_n_s32(vacc3x89AB, vbicq_s32(vacc3x89AB, vzero_shift_mask), 31);
+ vacc3xCDEF = vsraq_n_s32(vacc3xCDEF, vbicq_s32(vacc3xCDEF, vzero_shift_mask), 31);
+
+ vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+ vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+ vacc0x89AB = vrshlq_s32(vacc0x89AB, vright_shift);
+ vacc0xCDEF = vrshlq_s32(vacc0xCDEF, vright_shift);
+ vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+ vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+ vacc1x89AB = vrshlq_s32(vacc1x89AB, vright_shift);
+ vacc1xCDEF = vrshlq_s32(vacc1xCDEF, vright_shift);
+ vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+ vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+ vacc2x89AB = vrshlq_s32(vacc2x89AB, vright_shift);
+ vacc2xCDEF = vrshlq_s32(vacc2xCDEF, vright_shift);
+ vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+ vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+ vacc3x89AB = vrshlq_s32(vacc3x89AB, vright_shift);
+ vacc3xCDEF = vrshlq_s32(vacc3xCDEF, vright_shift);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x89AB), vacc0xCDEF), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x89AB), vacc1xCDEF), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+ const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x89AB), vacc2xCDEF), voutput_zero_point);
+ const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+ const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x89AB), vacc3xCDEF), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc0x89ABCDEF);
+ int8x16_t vout1x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc1x01234567), vacc1x89ABCDEF);
+ int8x16_t vout2x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc2x89ABCDEF);
+ int8x16_t vout3x0123456789ABCDEF = vqmovn_high_s16(vqmovn_s16(vacc3x01234567), vacc3x89ABCDEF);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc0x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x89AB), vqmovn_s32(vacc0xCDEF)), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+ const int16x8_t vacc1x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x89AB), vqmovn_s32(vacc1xCDEF)), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+ const int16x8_t vacc2x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x89AB), vqmovn_s32(vacc2xCDEF)), voutput_zero_point);
+ const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+ const int16x8_t vacc3x89ABCDEF = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x89AB), vqmovn_s32(vacc3xCDEF)), voutput_zero_point);
+
+ int8x16_t vout0x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc0x89ABCDEF));
+ int8x16_t vout1x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc1x01234567), vqmovn_s16(vacc1x89ABCDEF));
+ int8x16_t vout2x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc2x89ABCDEF));
+ int8x16_t vout3x0123456789ABCDEF = vcombine_s8(vqmovn_s16(vacc3x01234567), vqmovn_s16(vacc3x89ABCDEF));
+#endif
+ const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+ vout3x0123456789ABCDEF = vmaxq_s8(vout3x0123456789ABCDEF, voutput_min);
+ vout2x0123456789ABCDEF = vmaxq_s8(vout2x0123456789ABCDEF, voutput_min);
+ vout1x0123456789ABCDEF = vmaxq_s8(vout1x0123456789ABCDEF, voutput_min);
+ vout0x0123456789ABCDEF = vmaxq_s8(vout0x0123456789ABCDEF, voutput_min);
+
+ vout3x0123456789ABCDEF = vminq_s8(vout3x0123456789ABCDEF, voutput_max);
+ vout2x0123456789ABCDEF = vminq_s8(vout2x0123456789ABCDEF, voutput_max);
+ vout1x0123456789ABCDEF = vminq_s8(vout1x0123456789ABCDEF, voutput_max);
+ vout0x0123456789ABCDEF = vminq_s8(vout0x0123456789ABCDEF, voutput_max);
+
+ if (nc >= 16) {
+ vst1q_s8(c3 + 0, vout3x0123456789ABCDEF);
+ vst1q_s8(c2 + 0, vout2x0123456789ABCDEF);
+ vst1q_s8(c1 + 0, vout1x0123456789ABCDEF);
+ vst1q_s8(c0 + 0, vout0x0123456789ABCDEF);
+
+ c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+ c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 16;
+ } else {
+ int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vget_low_s8(vout0x0123456789ABCDEF), vget_low_s8(vout1x0123456789ABCDEF));
+ int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vget_low_s8(vout2x0123456789ABCDEF), vget_low_s8(vout3x0123456789ABCDEF));
+ if (nc & 8) {
+ vst1_s8(c3, vget_high_s8(vout2x01234567_3x01234567)); c3 += 8;
+ vst1_s8(c2, vget_low_s8(vout2x01234567_3x01234567)); c2 += 8;
+ vst1_s8(c1, vget_high_s8(vout0x01234567_1x01234567)); c1 += 8;
+ vst1_s8(c0, vget_low_s8(vout0x01234567_1x01234567)); c0 += 8;
+ vout2x01234567_3x01234567 = vcombine_s8(vget_high_s8(vout2x0123456789ABCDEF), vget_high_s8(vout3x0123456789ABCDEF));
+ vout0x01234567_1x01234567 = vcombine_s8(vget_high_s8(vout0x0123456789ABCDEF), vget_high_s8(vout1x0123456789ABCDEF));
+ }
+ if (nc & 4) {
+ vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+ vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+ vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c b/src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
new file mode 100644
index 0000000..d135354
--- /dev/null
+++ b/src/qs8-igemm/gen/4x8c2-minmax-neon-mlal-padal-dup.c
@@ -0,0 +1,492 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 4);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+ int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 4) {
+ c3 = c2;
+ }
+
+ do {
+ int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+ int32x4_t vacc1x0123 = vacc0x0123;
+ int32x4_t vacc1x4567 = vacc0x4567;
+ int32x4_t vacc2x0123 = vacc0x0123;
+ int32x4_t vacc2x4567 = vacc0x4567;
+ int32x4_t vacc3x0123 = vacc0x0123;
+ int32x4_t vacc3x4567 = vacc0x4567;
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const int8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ const int8_t* restrict a2 = a[2];
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+ }
+ const int8_t* restrict a3 = a[3];
+ if XNN_UNPREDICTABLE(a3 != zero) {
+ a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+ }
+ a += 4;
+
+ size_t k = kc;
+
+ while (k >= 16 * sizeof(int8_t)) {
+ const int8x8_t va0x0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va0x1 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1x0 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va1x1 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va2x0 = vld1_s8(a2); a2 += 8;
+ const int8x8_t va2x1 = vld1_s8(a2); a2 += 8;
+ const int8x8_t va3x0 = vld1_s8(a3); a3 += 8;
+ const int8x8_t va3x1 = vld1_s8(a3); a3 += 8;
+
+ const int8x8_t vb0123c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3x0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+ int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+ const int8x8_t vb0123c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c0 = vmlal_s8(vprod0x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1x0123c0 = vmlal_s8(vprod1x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vprod2x0123c0 = vmlal_s8(vprod2x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+ vprod3x0123c0 = vmlal_s8(vprod3x0123c0, vb0123c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+ int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 0)));
+ int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 0)));
+ int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 0)));
+ int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 0)));
+ const int8x8_t vb4567c0x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c0 = vmlal_s8(vprod0x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 0)));
+ vprod1x4567c0 = vmlal_s8(vprod1x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 0)));
+ vprod2x4567c0 = vmlal_s8(vprod2x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 0)));
+ vprod3x4567c0 = vmlal_s8(vprod3x4567c0, vb4567c0x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+ int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+ int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+ const int8x8_t vb0123c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c1 = vmlal_s8(vprod0x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1x0123c1 = vmlal_s8(vprod1x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vprod2x0123c1 = vmlal_s8(vprod2x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+ vprod3x0123c1 = vmlal_s8(vprod3x0123c1, vb0123c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+ int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 1)));
+ int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 1)));
+ int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 1)));
+ int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 1)));
+ const int8x8_t vb4567c1x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c1 = vmlal_s8(vprod0x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 1)));
+ vprod1x4567c1 = vmlal_s8(vprod1x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 1)));
+ vprod2x4567c1 = vmlal_s8(vprod2x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 1)));
+ vprod3x4567c1 = vmlal_s8(vprod3x4567c1, vb4567c1x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+ int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+ int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+ const int8x8_t vb0123c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c2 = vmlal_s8(vprod0x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1x0123c2 = vmlal_s8(vprod1x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vprod2x0123c2 = vmlal_s8(vprod2x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+ vprod3x0123c2 = vmlal_s8(vprod3x0123c2, vb0123c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+ int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 2)));
+ int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 2)));
+ int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 2)));
+ int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 2)));
+ const int8x8_t vb4567c2x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c2 = vmlal_s8(vprod0x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 2)));
+ vprod1x4567c2 = vmlal_s8(vprod1x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 2)));
+ vprod2x4567c2 = vmlal_s8(vprod2x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 2)));
+ vprod3x4567c2 = vmlal_s8(vprod3x4567c2, vb4567c2x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+ int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+ int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+ const int8x8_t vb0123c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x0123c3 = vmlal_s8(vprod0x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1x0123c3 = vmlal_s8(vprod1x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vprod2x0123c3 = vmlal_s8(vprod2x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+ vprod3x0123c3 = vmlal_s8(vprod3x0123c3, vb0123c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+ int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x0), 3)));
+ int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x0), 3)));
+ int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x0), 3)));
+ int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3x0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x0), 3)));
+ const int8x8_t vb4567c3x1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ vprod0x4567c3 = vmlal_s8(vprod0x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0x1), 3)));
+ vprod1x4567c3 = vmlal_s8(vprod1x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1x1), 3)));
+ vprod2x4567c3 = vmlal_s8(vprod2x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2x1), 3)));
+ vprod3x4567c3 = vmlal_s8(vprod3x4567c3, vb4567c3x1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3x1), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+
+ k -= 16 * sizeof(int8_t);
+ }
+
+ if (k >= 8 * sizeof(int8_t)) {
+ const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+ const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+ const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+ const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+ const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+ const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+
+ k -= 8 * sizeof(int8_t);
+ }
+
+ if XNN_UNLIKELY(k != 0) {
+ const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);
+ const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+ const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+ const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+ const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+ const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+ const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+ const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+ const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+ const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+ const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+ const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+
+ if (k > 2 * sizeof(int8_t)) {
+ const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+ const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+ const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+ const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+ const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+ const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+ const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+ const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+
+ if (k > 4 * sizeof(int8_t)) {
+ const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+ const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+ const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+ const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+ const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+ const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+ const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+ const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+
+ if (k > 6 * sizeof(int8_t)) {
+ const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+ const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+ const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+ const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+ vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+ const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+ const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+ vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+ const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+ const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+ vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+ const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+ const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+ vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+ }
+ }
+ }
+ }
+ p -= 4 * sizeof(void*);
+ } while (p != 0);
+
+ const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);
+ vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+ vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+ vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+ vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+ vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+ vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+ vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+ vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+ const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+ const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));
+ vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);
+ vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+ vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+ vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+ vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+ vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+ vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+ vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+ vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);
+ vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+ vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+ vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+ vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+ vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+ vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+ vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+ const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+ const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+
+ int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);
+ int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+ const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+ const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+ const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+ const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+ int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+ int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+ const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+ const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+ vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+ vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+ vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+ vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+ if (nc >= 8) {
+ vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));
+ vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+ vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+ vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+ c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+ c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+ vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+ vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+ }
+ if (nc & 2) {
+ vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+ vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+ vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+ vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+ }
+ if (nc & 1) {
+ vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+ vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+ vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+ vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c b/src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
new file mode 100644
index 0000000..8afdcf0
--- /dev/null
+++ b/src/qs8-igemm/gen/4x8c2-minmax-neon-mull-padal-dup.c
@@ -0,0 +1,366 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/c2-neon-mull-padal-dup.c.in
+// Generator: tools/xngen
+//
+// Copyright 2021 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <arm_neon.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/gemm.h>
+
+// QS8 indirect-GEMM micro-kernel: up to 4 rows x 8 columns per tile, K consumed in int8 pairs (c2) with NEON vmull_s8 + vpadalq_s16, then requantized and clamped to [output_min, output_max].
+void xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup(
+    size_t mr,  // rows to compute, 1..4; rows beyond mr alias the previous row's output pointer
+    size_t nc,  // output columns remaining; handled 8 at a time, with a 4/2/1 tail
+    size_t kc,  // bytes of K read through each indirection pointer
+    size_t ks,  // bytes of indirection entries walked per column block; multiple of 4 * sizeof(void*)
+    const int8_t** restrict a,  // indirection buffer, consumed in groups of 4 row pointers
+    const void* restrict w,  // packed weights: 8 int32 initial accumulators, then int8 weights in 8-byte groups
+    int8_t* restrict c,  // output pointer for row 0
+    size_t cm_stride,  // byte stride between output rows
+    size_t cn_stride,  // byte step applied to every row pointer after a full 8-column store
+    size_t a_offset,  // byte offset added to each indirection pointer that differs from `zero`
+    const int8_t* zero,  // sentinel pointer; entries equal to it are used without a_offset
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN  // reads neon.multiplier, right_shift, output_zero_point, output_min, output_max
+{
+  assert(mr != 0);
+  assert(mr <= 4);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (4 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;  // row 1 not requested: its stores land on row 0, which is written afterwards
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+  int8_t* c3 = (int8_t*) ((uintptr_t) c2 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 4) {
+    c3 = c2;
+  }
+
+  do {
+    int32x4_t vacc0x0123 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));  // initial accumulators come from the packed weights
+    int32x4_t vacc0x4567 = vld1q_s32(w); w = (const void*) ((uintptr_t) w + 4 * sizeof(int32_t));
+    int32x4_t vacc1x0123 = vacc0x0123;  // every row starts from the same initial values
+    int32x4_t vacc1x4567 = vacc0x4567;
+    int32x4_t vacc2x0123 = vacc0x0123;
+    int32x4_t vacc2x4567 = vacc0x4567;
+    int32x4_t vacc3x0123 = vacc0x0123;
+    int32x4_t vacc3x4567 = vacc0x4567;
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      const int8_t* restrict a3 = a[3];
+      if XNN_UNPREDICTABLE(a3 != zero) {
+        a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
+      }
+      a += 4;
+
+      size_t k = kc;
+
+      // Main loop: 8 bytes of K per row, i.e. four int8 pairs (c0..c3), per iteration.
+      while (k >= 8 * sizeof(int8_t)) {
+        const int8x8_t va0 = vld1_s8(a0); a0 += 8;
+        const int8x8_t va1 = vld1_s8(a1); a1 += 8;
+        const int8x8_t va2 = vld1_s8(a2); a2 += 8;
+        const int8x8_t va3 = vld1_s8(a3); a3 += 8;
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));  // each 8-byte group feeds 4 int32 lanes (columns 0-3 here) for one K pair
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));  // broadcast the row's int8 pair (one 16-bit lane) and widen-multiply by the weights
+        const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);  // add adjacent int16 products pairwise into the int32 accumulators
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+        const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+        const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+        const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+        const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+        const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+        const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+        const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+        const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+        const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+        const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+        const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+
+        k -= 8 * sizeof(int8_t);
+      }
+
+      if XNN_UNLIKELY(k != 0) {  // remainder: 1..7 bytes of K left; one K pair is applied per nesting level below
+        const int8x8_t va0 = vld1_s8(a0); a0 = (const int8_t*) ((uintptr_t) a0 + k);  // still loads a full 8 bytes, but the pointer advances by k only
+        const int8x8_t va1 = vld1_s8(a1); a1 = (const int8_t*) ((uintptr_t) a1 + k);
+        const int8x8_t va2 = vld1_s8(a2); a2 = (const int8_t*) ((uintptr_t) a2 + k);
+        const int8x8_t va3 = vld1_s8(a3); a3 = (const int8_t*) ((uintptr_t) a3 + k);
+
+        const int8x8_t vb0123c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+        const int8x8_t vb4567c0 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+        const int16x8_t vprod0x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c0);
+        const int16x8_t vprod0x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 0)));
+        vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c0);
+        const int16x8_t vprod1x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c0);
+        const int16x8_t vprod1x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 0)));
+        vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c0);
+        const int16x8_t vprod2x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c0);
+        const int16x8_t vprod2x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 0)));
+        vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c0);
+        const int16x8_t vprod3x0123c0 = vmull_s8(vb0123c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c0);
+        const int16x8_t vprod3x4567c0 = vmull_s8(vb4567c0, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 0)));
+        vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c0);
+
+        if (k > 2 * sizeof(int8_t)) {
+          const int8x8_t vb0123c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+          const int8x8_t vb4567c1 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+          const int16x8_t vprod0x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c1);
+          const int16x8_t vprod0x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 1)));
+          vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c1);
+          const int16x8_t vprod1x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c1);
+          const int16x8_t vprod1x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 1)));
+          vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c1);
+          const int16x8_t vprod2x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c1);
+          const int16x8_t vprod2x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 1)));
+          vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c1);
+          const int16x8_t vprod3x0123c1 = vmull_s8(vb0123c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+          vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c1);
+          const int16x8_t vprod3x4567c1 = vmull_s8(vb4567c1, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 1)));
+          vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c1);
+
+          if (k > 4 * sizeof(int8_t)) {
+            const int8x8_t vb0123c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+            const int8x8_t vb4567c2 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+            const int16x8_t vprod0x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c2);
+            const int16x8_t vprod0x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 2)));
+            vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c2);
+            const int16x8_t vprod1x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c2);
+            const int16x8_t vprod1x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 2)));
+            vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c2);
+            const int16x8_t vprod2x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c2);
+            const int16x8_t vprod2x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 2)));
+            vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c2);
+            const int16x8_t vprod3x0123c2 = vmull_s8(vb0123c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+            vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c2);
+            const int16x8_t vprod3x4567c2 = vmull_s8(vb4567c2, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 2)));
+            vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c2);
+
+            if (k > 6 * sizeof(int8_t)) {
+              const int8x8_t vb0123c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+              const int8x8_t vb4567c3 = vld1_s8(w); w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
+
+              const int16x8_t vprod0x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+              vacc0x0123 = vpadalq_s16(vacc0x0123, vprod0x0123c3);
+              const int16x8_t vprod0x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va0), 3)));
+              vacc0x4567 = vpadalq_s16(vacc0x4567, vprod0x4567c3);
+              const int16x8_t vprod1x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+              vacc1x0123 = vpadalq_s16(vacc1x0123, vprod1x0123c3);
+              const int16x8_t vprod1x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va1), 3)));
+              vacc1x4567 = vpadalq_s16(vacc1x4567, vprod1x4567c3);
+              const int16x8_t vprod2x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+              vacc2x0123 = vpadalq_s16(vacc2x0123, vprod2x0123c3);
+              const int16x8_t vprod2x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va2), 3)));
+              vacc2x4567 = vpadalq_s16(vacc2x4567, vprod2x4567c3);
+              const int16x8_t vprod3x0123c3 = vmull_s8(vb0123c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+              vacc3x0123 = vpadalq_s16(vacc3x0123, vprod3x0123c3);
+              const int16x8_t vprod3x4567c3 = vmull_s8(vb4567c3, vreinterpret_s8_s16(vdup_lane_s16(vreinterpret_s16_s8(va3), 3)));
+              vacc3x4567 = vpadalq_s16(vacc3x4567, vprod3x4567c3);
+            }
+          }
+        }
+      }
+      p -= 4 * sizeof(void*);
+    } while (p != 0);
+
+    const int32x4_t vmultiplier = vld1q_dup_s32(&params->neon.multiplier);  // requantization step 1: saturating rounding doubling multiply-high
+    vacc0x0123 = vqrdmulhq_s32(vacc0x0123, vmultiplier);
+    vacc0x4567 = vqrdmulhq_s32(vacc0x4567, vmultiplier);
+    vacc1x0123 = vqrdmulhq_s32(vacc1x0123, vmultiplier);
+    vacc1x4567 = vqrdmulhq_s32(vacc1x4567, vmultiplier);
+    vacc2x0123 = vqrdmulhq_s32(vacc2x0123, vmultiplier);
+    vacc2x4567 = vqrdmulhq_s32(vacc2x4567, vmultiplier);
+    vacc3x0123 = vqrdmulhq_s32(vacc3x0123, vmultiplier);
+    vacc3x4567 = vqrdmulhq_s32(vacc3x4567, vmultiplier);
+
+    const int32x4_t vright_shift = vld1q_dup_s32(&params->neon.right_shift);
+    const int32x4_t vzero_shift_mask = vreinterpretq_s32_u32(vceqq_s32(vright_shift, vmovq_n_s32(0)));  // all-ones when the shift amount is zero
+    vacc0x0123 = vsraq_n_s32(vacc0x0123, vbicq_s32(vacc0x0123, vzero_shift_mask), 31);  // adds -1 to negative lanes unless the shift is zero; presumably a rounding fix-up for the shift below
+    vacc0x4567 = vsraq_n_s32(vacc0x4567, vbicq_s32(vacc0x4567, vzero_shift_mask), 31);
+    vacc1x0123 = vsraq_n_s32(vacc1x0123, vbicq_s32(vacc1x0123, vzero_shift_mask), 31);
+    vacc1x4567 = vsraq_n_s32(vacc1x4567, vbicq_s32(vacc1x4567, vzero_shift_mask), 31);
+    vacc2x0123 = vsraq_n_s32(vacc2x0123, vbicq_s32(vacc2x0123, vzero_shift_mask), 31);
+    vacc2x4567 = vsraq_n_s32(vacc2x4567, vbicq_s32(vacc2x4567, vzero_shift_mask), 31);
+    vacc3x0123 = vsraq_n_s32(vacc3x0123, vbicq_s32(vacc3x0123, vzero_shift_mask), 31);
+    vacc3x4567 = vsraq_n_s32(vacc3x4567, vbicq_s32(vacc3x4567, vzero_shift_mask), 31);
+
+    vacc0x0123 = vrshlq_s32(vacc0x0123, vright_shift);  // rounding shift; NOTE(review): right_shift is presumably stored as a non-positive count - confirm against params setup
+    vacc0x4567 = vrshlq_s32(vacc0x4567, vright_shift);
+    vacc1x0123 = vrshlq_s32(vacc1x0123, vright_shift);
+    vacc1x4567 = vrshlq_s32(vacc1x4567, vright_shift);
+    vacc2x0123 = vrshlq_s32(vacc2x0123, vright_shift);
+    vacc2x4567 = vrshlq_s32(vacc2x4567, vright_shift);
+    vacc3x0123 = vrshlq_s32(vacc3x0123, vright_shift);
+    vacc3x4567 = vrshlq_s32(vacc3x4567, vright_shift);
+
+    const int16x8_t voutput_zero_point = vld1q_dup_s16(&params->neon.output_zero_point);
+#if XNN_ARCH_ARM64
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc0x0123), vacc0x4567), voutput_zero_point);  // saturating narrow to int16, then add the zero point with saturation
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc1x0123), vacc1x4567), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc2x0123), vacc2x4567), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vqmovn_high_s32(vqmovn_s32(vacc3x0123), vacc3x4567), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vqmovn_high_s16(vqmovn_s16(vacc0x01234567), vacc1x01234567);  // two rows per vector: low half = row 0, high half = row 1
+    int8x16_t vout2x01234567_3x01234567 = vqmovn_high_s16(vqmovn_s16(vacc2x01234567), vacc3x01234567);
+#else
+    const int16x8_t vacc0x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc0x0123), vqmovn_s32(vacc0x4567)), voutput_zero_point);
+    const int16x8_t vacc1x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc1x0123), vqmovn_s32(vacc1x4567)), voutput_zero_point);
+    const int16x8_t vacc2x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc2x0123), vqmovn_s32(vacc2x4567)), voutput_zero_point);
+    const int16x8_t vacc3x01234567 = vqaddq_s16(vcombine_s16(vqmovn_s32(vacc3x0123), vqmovn_s32(vacc3x4567)), voutput_zero_point);
+
+    int8x16_t vout0x01234567_1x01234567 = vcombine_s8(vqmovn_s16(vacc0x01234567), vqmovn_s16(vacc1x01234567));
+    int8x16_t vout2x01234567_3x01234567 = vcombine_s8(vqmovn_s16(vacc2x01234567), vqmovn_s16(vacc3x01234567));
+#endif
+    const int8x16_t voutput_min = vld1q_dup_s8(&params->neon.output_min);
+    const int8x16_t voutput_max = vld1q_dup_s8(&params->neon.output_max);
+
+    vout2x01234567_3x01234567 = vmaxq_s8(vout2x01234567_3x01234567, voutput_min);
+    vout0x01234567_1x01234567 = vmaxq_s8(vout0x01234567_1x01234567, voutput_min);
+
+    vout2x01234567_3x01234567 = vminq_s8(vout2x01234567_3x01234567, voutput_max);
+    vout0x01234567_1x01234567 = vminq_s8(vout0x01234567_1x01234567, voutput_max);
+
+    if (nc >= 8) {
+      vst1_s8(c3 + 0, vget_high_s8(vout2x01234567_3x01234567));  // rows are stored last-to-first, so aliased pointers end up holding the lowest row
+      vst1_s8(c2 + 0, vget_low_s8(vout2x01234567_3x01234567));
+      vst1_s8(c1 + 0, vget_high_s8(vout0x01234567_1x01234567));
+      vst1_s8(c0 + 0, vget_low_s8(vout0x01234567_1x01234567));
+
+      c3 = (int8_t*) ((uintptr_t) c3 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);  // rewind so the next column block walks the same indirection entries
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        vst1q_lane_u32(__builtin_assume_aligned(c3, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 2); c3 += 4;  // 32-bit lane 2 = first four bytes of the high-half row
+        vst1q_lane_u32(__builtin_assume_aligned(c2, 1), vreinterpretq_u32_s8(vout2x01234567_3x01234567), 0); c2 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c1, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 2); c1 += 4;
+        vst1q_lane_u32(__builtin_assume_aligned(c0, 1), vreinterpretq_u32_s8(vout0x01234567_1x01234567), 0); c0 += 4;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 4);  // rotate by 4 bytes so the next unsaved columns sit at bytes 0 and 8
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 4);
+      }
+      if (nc & 2) {
+        vst1q_lane_u16(__builtin_assume_aligned(c3, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 4); c3 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c2, 1), vreinterpretq_u16_s8(vout2x01234567_3x01234567), 0); c2 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c1, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 4); c1 += 2;
+        vst1q_lane_u16(__builtin_assume_aligned(c0, 1), vreinterpretq_u16_s8(vout0x01234567_1x01234567), 0); c0 += 2;
+        vout2x01234567_3x01234567 = vextq_s8(vout2x01234567_3x01234567, vout2x01234567_3x01234567, 2);
+        vout0x01234567_1x01234567 = vextq_s8(vout0x01234567_1x01234567, vout0x01234567_1x01234567, 2);
+      }
+      if (nc & 1) {
+        vst1q_lane_s8(c3, vout2x01234567_3x01234567, 8);
+        vst1q_lane_s8(c2, vout2x01234567_3x01234567, 0);
+        vst1q_lane_s8(c1, vout0x01234567_1x01234567, 8);
+        vst1q_lane_s8(c0, vout0x01234567_1x01234567, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 6534747..e43cd87 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -347,6 +347,26 @@
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16c8__neon_mull_padal)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup)  // c2 "mull" kernels, 8-column tiles, 1 to 4 rows
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup)  // c2 "mull" kernels, 16-column tiles
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup)  // c2 "mlal" kernels, 8-column tiles
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup)
+
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup)  // c2 "mlal" kernels, 16-column tiles
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup)
+
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_6x8c4__neondot)
diff --git a/test/qs8-igemm-minmax.cc b/test/qs8-igemm-minmax.cc
index 8a5a0e0..617efb1 100644
--- a/test/qs8-igemm-minmax.cc
+++ b/test/qs8-igemm-minmax.cc
@@ -7511,6 +7511,7494 @@
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {  // Full 1x8 tile (m = 1, n = 8) at k = 8.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, strided_cn) {  // Full 1x8 tile at k = 8 with cn_stride(11).
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {  // Every m in [1, 1] and n in [1, 8] at k = 8; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {  // m in [1, 1] with n = 8, k = 8; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {  // n in [1, 8] with m = 1, k = 8; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {  // k = 1..7 on the full 1x8 tile.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {  // k = 1..7 for every m in [1, 1] and n in [1, 8]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {  // k = 9..15 on the full 1x8 tile.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {  // k = 9..15 for every m in [1, 1] and n in [1, 8]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_div_8) {  // k = 16, 24, ..., 80 on the full 1x8 tile.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {  // k = 16, 24, ..., 80 for every m in [1, 1] and n in [1, 8]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {  // n = 9..15 (above nr = 8) with m = 1; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {  // n = 9..15 with cn_stride(11), m = 1; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {  // n = 9..15, k = 1, 10, 19, 28, 37, m in [1, 1]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_div_8) {  // n = 16 and 24 (multiples of nr = 8) with m = 1; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {  // n = 16 and 24 with cn_stride(11), m = 1; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {  // n = 16 and 24, k = 1, 10, 19, 28, 37, m in [1, 1]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, small_kernel) {  // Full 1x8 tile with ks(3); k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {  // ks(3) for every m in [1, 1] and n in [1, 8]; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {  // n = 9..15 with ks(3), m = 1; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {  // n = 16 and 24 with ks(3), m = 1; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {  // cm_stride(11) for every m in [1, 1] and n in [1, 8]; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, a_offset) {  // Full 1x8 tile with ks(3) and a_offset(43); k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(43)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, zero) {  // zero_index(mz) for mz in [0, 1), with ks(3) and a_offset(43); k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 1; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(43)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, qmin) {  // Full 1x8 tile at k = 8 with qmin(128).
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, qmax) {  // Full 1x8 tile at k = 8 with qmax(128).
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MULL_PADAL_DUP, strided_cm) {  // Full 1x8 tile at k = 8 with cm_stride(11).
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {  // Full 2x8 tile (m = 2, n = 8) at k = 8.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, strided_cn) {  // Full 2x8 tile at k = 8 with cn_stride(11).
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {  // Every m in [1, 2] and n in [1, 8] at k = 8; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {  // m in [1, 2] with n = 8, k = 8; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {  // n in [1, 8] with m = 2, k = 8; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {  // k = 1..7 on the full 2x8 tile.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {  // k = 1..7 for every m in [1, 2] and n in [1, 8]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {  // k = 9..15 on the full 2x8 tile.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {  // k = 9..15 for every m in [1, 2] and n in [1, 8]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_div_8) {  // k = 16, 24, ..., 80 on the full 2x8 tile.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {  // k = 16, 24, ..., 80 for every m in [1, 2] and n in [1, 8]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {  // n = 9..15 (above nr = 8) with m = 2; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {  // n = 9..15 with cn_stride(11), m = 2; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {  // n = 9..15, k = 1, 10, 19, 28, 37, m in [1, 2]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_div_8) {  // n = 16 and 24 (multiples of nr = 8) with m = 2; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {  // n = 16 and 24 with cn_stride(11), m = 2; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {  // n = 16 and 24, k = 1, 10, 19, 28, 37, m in [1, 2]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, small_kernel) {  // Full 2x8 tile with ks(3); k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {  // ks(3) for every m in [1, 2] and n in [1, 8]; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {  // n = 9..15 with ks(3), m = 2; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {  // n = 16 and 24 with ks(3), m = 2; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {  // cm_stride(11) for every m in [1, 2] and n in [1, 8]; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, a_offset) {  // Full 2x8 tile with ks(3) and a_offset(83); k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, zero) {  // zero_index(mz) for mz in [0, 2), with ks(3) and a_offset(83); k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 2; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, qmin) {  // Full 2x8 tile at k = 8 with qmin(128).
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, qmax) {  // Full 2x8 tile at k = 8 with qmax(128).
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MULL_PADAL_DUP, strided_cm) {  // Full 2x8 tile at k = 8 with cm_stride(11).
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {  // Full 3x8 tile (m = 3, n = 8) at k = 8.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, strided_cn) {  // Full 3x8 tile at k = 8 with cn_stride(11).
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {  // Every m in [1, 3] and n in [1, 8] at k = 8; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {  // m in [1, 3] with n = 8, k = 8; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {  // n in [1, 8] with m = 3, k = 8; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {  // k = 1..7 on the full 3x8 tile.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {  // k = 1..7 for every m in [1, 3] and n in [1, 8]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {  // k = 9..15 on the full 3x8 tile.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {  // k = 9..15 for every m in [1, 3] and n in [1, 8]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_div_8) {  // k = 16, 24, ..., 80 on the full 3x8 tile.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {  // k = 16, 24, ..., 80 for every m in [1, 3] and n in [1, 8]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {  // n = 9..15 (above nr = 8) with m = 3; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {  // n = 9..15 with cn_stride(11), m = 3; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {  // n = 9..15, k = 1, 10, 19, 28, 37, m in [1, 3]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_div_8) {  // n = 16 and 24 (multiples of nr = 8) with m = 3; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {  // n = 16 and 24 with cn_stride(11), m = 3; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {  // n = 16 and 24, k = 1, 10, 19, 28, 37, m in [1, 3]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, small_kernel) {  // Full 3x8 tile with ks(3); k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {  // ks(3) for every m in [1, 3] and n in [1, 8]; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {  // n = 9..15 with ks(3), m = 3; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {  // n = 16 and 24 with ks(3), m = 3; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {  // cm_stride(11) for every m in [1, 3] and n in [1, 8]; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, a_offset) {  // Full 3x8 tile with ks(3) and a_offset(127); k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(127)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, zero) {  // zero_index(mz) for mz in [0, 3), with ks(3) and a_offset(127); k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 3; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(127)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, qmin) {  // Full 3x8 tile at k = 8 with qmin(128).
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, qmax) {  // Full 3x8 tile at k = 8 with qmax(128).
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MULL_PADAL_DUP, strided_cm) {  // Full 3x8 tile at k = 8 with cm_stride(11).
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(8)
+ .cm_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8) {  // Full 4x8 tile (m = 4, n = 8) at k = 8.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, strided_cn) {  // Full 4x8 tile at k = 8 with cn_stride(11).
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(8)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {  // Every m in [1, 4] and n in [1, 8] at k = 8; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {  // m in [1, 4] with n = 8, k = 8; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {  // n in [1, 8] with m = 4, k = 8; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_lt_8) {  // k = 1..7 on the full 4x8 tile.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {  // k = 1..7 for every m in [1, 4] and n in [1, 8]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_gt_8) {  // k = 9..15 on the full 4x8 tile.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {  // k = 9..15 for every m in [1, 4] and n in [1, 8]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_div_8) {  // k = 16, 24, ..., 80 on the full 4x8 tile.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {  // k = 16, 24, ..., 80 for every m in [1, 4] and n in [1, 8]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8) {  // n = 9..15 (above nr = 8) with m = 4; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_strided_cn) {  // n = 9..15 with cn_stride(11), m = 4; k = 1, 10, 19, 28, 37.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)  // follow the loop variable; a constant 8 would only repeat the n = 8 case
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_subtile) {  // n = 9..15, k = 1, 10, 19, 28, 37, m in [1, 4]; one iteration per shape.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_div_8) {  // n = 16, 24 (multiples of nr = 8) at m = 4, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 16; n <= 24; n += 8) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(4)
+         .nr(8)
+         .kr(2)
+         .sr(1)
+         .m(4)
+         .n(n)  // pass the loop's n; a constant 8 here would leave the loop variable unused
+         .k(k)
+         .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_strided_cn) {  // n = 16, 24 with cn_stride = 11, m = 4
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       GemmMicrokernelTester()
+         .mr(4)
+         .nr(8)
+         .kr(2)
+         .sr(1)
+         .m(4)
+         .n(n_size)
+         .k(k_size)
+         .cn_stride(11)
+         .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_subtile) {  // n = 16, 24, k = 1, 10, ..., 37, m = 1..4
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n_size = 16; n_size <= 24; n_size += 8) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
+         GemmMicrokernelTester()
+           .mr(4)
+           .nr(8)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, small_kernel) {  // ks = 3 at m = 4, n = 8, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     GemmMicrokernelTester()
+       .mr(4)
+       .nr(8)
+       .kr(2)
+       .sr(1)
+       .m(4)
+       .n(8)
+       .k(k_size)
+       .ks(3)
+       .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {  // ks = 3 for every m in 1..4, n in 1..8
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(4)
+           .nr(8)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .ks(3)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_gt_8_small_kernel) {  // n = 9..15 with ks = 3, m = 4, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 9; n < 16; n++) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(4)
+         .nr(8)
+         .kr(2)
+         .sr(1)
+         .m(4)
+         .n(n)  // pass the loop's n; a constant 8 here would leave the loop variable unused
+         .k(k)
+         .ks(3)
+         .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, n_div_8_small_kernel) {  // n = 16, 24 with ks = 3, m = 4, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 16; n <= 24; n += 8) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(4)
+         .nr(8)
+         .kr(2)
+         .sr(1)
+         .m(4)
+         .n(n)  // pass the loop's n; a constant 8 here would leave the loop variable unused
+         .k(k)
+         .ks(3)
+         .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {  // cm_stride = 11 for every m in 1..4, n in 1..8
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     for (uint32_t m_size = 1; m_size <= 4; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 8; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(4)
+           .nr(8)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .cm_stride(11)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, a_offset) {  // a_offset = 163 with ks = 3, m = 4, n = 8
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     GemmMicrokernelTester()
+       .mr(4)
+       .nr(8)
+       .kr(2)
+       .sr(1)
+       .m(4)
+       .n(8)
+       .k(k_size)
+       .ks(3)
+       .a_offset(163)
+       .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, zero) {  // zero_index = 0..3 with ks = 3, a_offset = 163
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t zero_idx = 0; zero_idx < 4; ++zero_idx) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       GemmMicrokernelTester()
+         .mr(4)
+         .nr(8)
+         .kr(2)
+         .sr(1)
+         .m(4)
+         .n(8)
+         .k(k_size)
+         .ks(3)
+         .a_offset(163)
+         .zero_index(zero_idx)
+         .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, qmin) {  // qmin = 128 at m = 4, n = 8, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(4)
+     .nr(8)
+     .kr(2)
+     .sr(1)
+     .m(4)
+     .n(8)
+     .k(8)
+     .qmin(128)
+     .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, qmax) {  // qmax = 128 at m = 4, n = 8, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(4)
+     .nr(8)
+     .kr(2)
+     .sr(1)
+     .m(4)
+     .n(8)
+     .k(8)
+     .qmax(128)
+     .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MULL_PADAL_DUP, strided_cm) {  // cm_stride = 11 at m = 4, n = 8, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(4)
+     .nr(8)
+     .kr(2)
+     .sr(1)
+     .m(4)
+     .n(8)
+     .k(8)
+     .cm_stride(11)
+     .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {  // m = 1, n = 16, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(1)
+     .nr(16)
+     .kr(2)
+     .sr(1)
+     .m(1)
+     .n(16)
+     .k(8)
+     .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, strided_cn) {  // cn_stride = 19 at m = 1, n = 16, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(1)
+     .nr(16)
+     .kr(2)
+     .sr(1)
+     .m(1)
+     .n(16)
+     .k(8)
+     .cn_stride(19)
+     .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {  // k = 8 for m = 1 and every n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
+     for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+       GemmMicrokernelTester()
+         .mr(1)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(m_size)
+         .n(n_size)
+         .k(8)
+         .iterations(1)
+         .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {  // k = 8, n = 16; the m loop runs once (m = 1)
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
+     GemmMicrokernelTester()
+       .mr(1)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(m_size)
+       .n(16)
+       .k(8)
+       .iterations(1)
+       .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {  // k = 8, m = 1, n = 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+     GemmMicrokernelTester()
+       .mr(1)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(1)
+       .n(n_size)
+       .k(8)
+       .iterations(1)
+       .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {  // k = 1..7 at m = 1, n = 16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size < 8; ++k_size) {
+     GemmMicrokernelTester()
+       .mr(1)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(1)
+       .n(16)
+       .k(k_size)
+       .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {  // k = 1..7 for m = 1 and every n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size < 8; ++k_size) {
+     for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(1)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {  // k = 9..15 at m = 1, n = 16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 9; k_size < 16; ++k_size) {
+     GemmMicrokernelTester()
+       .mr(1)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(1)
+       .n(16)
+       .k(k_size)
+       .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {  // k = 9..15 for m = 1 and every n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 9; k_size < 16; ++k_size) {
+     for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(1)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_div_8) {  // k = 16, 24, ..., 80 at m = 1, n = 16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 16; k_size <= 80; k_size += 8) {
+     GemmMicrokernelTester()
+       .mr(1)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(1)
+       .n(16)
+       .k(k_size)
+       .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {  // k = 16, 24, ..., 80 for m = 1 and every n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 16; k_size <= 80; k_size += 8) {
+     for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(1)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {  // n = 17..31 (above nr = 16) at m = 1, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(1)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(1)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {  // n = 17..31 with cn_stride = 19, m = 1, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(1)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(1)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .cn_stride(19)
+         .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {  // n = 17..31, k = 1, 10, ..., 37, m = 1
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n_size = 17; n_size < 32; ++n_size) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
+         GemmMicrokernelTester()
+           .mr(1)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_div_16) {  // n = 32, 48 (multiples of nr = 16) at m = 1, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 32; n <= 48; n += 16) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(1)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(1)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {  // n = 32, 48 with cn_stride = 19, m = 1
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       GemmMicrokernelTester()
+         .mr(1)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(1)
+         .n(n_size)
+         .k(k_size)
+         .cn_stride(19)
+         .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {  // n = 32, 48, k = 1, 10, ..., 37, m = 1
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
+         GemmMicrokernelTester()
+           .mr(1)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, small_kernel) {  // ks = 3 at m = 1, n = 16, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     GemmMicrokernelTester()
+       .mr(1)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(1)
+       .n(16)
+       .k(k_size)
+       .ks(3)
+       .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {  // ks = 3 for m = 1 and every n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(1)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .ks(3)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {  // n = 17..31 with ks = 3, m = 1, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(1)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(1)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .ks(3)
+         .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {  // n = 32, 48 with ks = 3, m = 1, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 32; n <= 48; n += 16) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(1)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(1)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .ks(3)
+         .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {  // cm_stride = 19 for m = 1 and every n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     for (uint32_t m_size = 1; m_size <= 1; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(1)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .cm_stride(19)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, a_offset) {  // a_offset = 43 with ks = 3, m = 1, n = 16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     GemmMicrokernelTester()
+       .mr(1)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(1)
+       .n(16)
+       .k(k_size)
+       .ks(3)
+       .a_offset(43)
+       .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, zero) {  // zero_index = 0 with ks = 3, a_offset = 43
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t zero_idx = 0; zero_idx < 1; ++zero_idx) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       GemmMicrokernelTester()
+         .mr(1)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(1)
+         .n(16)
+         .k(k_size)
+         .ks(3)
+         .a_offset(43)
+         .zero_index(zero_idx)
+         .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, qmin) {  // qmin = 128 at m = 1, n = 16, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(1)
+     .nr(16)
+     .kr(2)
+     .sr(1)
+     .m(1)
+     .n(16)
+     .k(8)
+     .qmin(128)
+     .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, qmax) {  // qmax = 128 at m = 1, n = 16, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(1)
+     .nr(16)
+     .kr(2)
+     .sr(1)
+     .m(1)
+     .n(16)
+     .k(8)
+     .qmax(128)
+     .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MULL_PADAL_DUP, strided_cm) {  // cm_stride = 19 at m = 1, n = 16, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(1)
+     .nr(16)
+     .kr(2)
+     .sr(1)
+     .m(1)
+     .n(16)
+     .k(8)
+     .cm_stride(19)
+     .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {  // m = 2, n = 16, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(2)
+     .nr(16)
+     .kr(2)
+     .sr(1)
+     .m(2)
+     .n(16)
+     .k(8)
+     .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, strided_cn) {  // cn_stride = 19 at m = 2, n = 16, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(2)
+     .nr(16)
+     .kr(2)
+     .sr(1)
+     .m(2)
+     .n(16)
+     .k(8)
+     .cn_stride(19)
+     .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {  // k = 8 for every m in 1..2, n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
+     for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+       GemmMicrokernelTester()
+         .mr(2)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(m_size)
+         .n(n_size)
+         .k(8)
+         .iterations(1)
+         .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {  // k = 8, n = 16, m = 1..2
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
+     GemmMicrokernelTester()
+       .mr(2)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(m_size)
+       .n(16)
+       .k(8)
+       .iterations(1)
+       .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {  // k = 8, m = 2, n = 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+     GemmMicrokernelTester()
+       .mr(2)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(2)
+       .n(n_size)
+       .k(8)
+       .iterations(1)
+       .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {  // k = 1..7 at m = 2, n = 16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size < 8; ++k_size) {
+     GemmMicrokernelTester()
+       .mr(2)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(2)
+       .n(16)
+       .k(k_size)
+       .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {  // k = 1..7 for every m in 1..2, n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size < 8; ++k_size) {
+     for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(2)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {  // k = 9..15 at m = 2, n = 16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 9; k_size < 16; ++k_size) {
+     GemmMicrokernelTester()
+       .mr(2)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(2)
+       .n(16)
+       .k(k_size)
+       .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {  // k = 9..15 for every m in 1..2, n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 9; k_size < 16; ++k_size) {
+     for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(2)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_div_8) {  // k = 16, 24, ..., 80 at m = 2, n = 16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 16; k_size <= 80; k_size += 8) {
+     GemmMicrokernelTester()
+       .mr(2)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(2)
+       .n(16)
+       .k(k_size)
+       .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {  // k = 16, 24, ..., 80 for every m in 1..2, n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 16; k_size <= 80; k_size += 8) {
+     for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(2)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {  // n = 17..31 (above nr = 16) at m = 2, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(2)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(2)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {  // n = 17..31 with cn_stride = 19, m = 2, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(2)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(2)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .cn_stride(19)
+         .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {  // n = 17..31, k = 1, 10, ..., 37, m = 1..2
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n_size = 17; n_size < 32; ++n_size) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
+         GemmMicrokernelTester()
+           .mr(2)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_div_16) {  // n = 32, 48 (multiples of nr = 16) at m = 2, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 32; n <= 48; n += 16) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(2)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(2)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {  // n = 32, 48 with cn_stride = 19, m = 2
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       GemmMicrokernelTester()
+         .mr(2)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(2)
+         .n(n_size)
+         .k(k_size)
+         .cn_stride(19)
+         .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {  // n = 32, 48, k = 1, 10, ..., 37, m = 1..2
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
+         GemmMicrokernelTester()
+           .mr(2)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, small_kernel) {  // ks = 3 at m = 2, n = 16, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     GemmMicrokernelTester()
+       .mr(2)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(2)
+       .n(16)
+       .k(k_size)
+       .ks(3)
+       .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {  // ks = 3 for every m in 1..2, n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(2)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .ks(3)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {  // n = 17..31 with ks = 3, m = 2, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(2)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(2)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .ks(3)
+         .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {  // n = 32, 48 with ks = 3, m = 2, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 32; n <= 48; n += 16) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(2)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(2)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .ks(3)
+         .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {  // cm_stride = 19 for every m in 1..2, n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     for (uint32_t m_size = 1; m_size <= 2; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(2)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .cm_stride(19)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, a_offset) {  // a_offset = 83 with ks = 3, m = 2, n = 16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     GemmMicrokernelTester()
+       .mr(2)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(2)
+       .n(16)
+       .k(k_size)
+       .ks(3)
+       .a_offset(83)
+       .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, zero) {  // zero_index = 0..1 with ks = 3, a_offset = 83
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t zero_idx = 0; zero_idx < 2; ++zero_idx) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       GemmMicrokernelTester()
+         .mr(2)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(2)
+         .n(16)
+         .k(k_size)
+         .ks(3)
+         .a_offset(83)
+         .zero_index(zero_idx)
+         .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, qmin) {  // qmin = 128 at m = 2, n = 16, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(2)
+     .nr(16)
+     .kr(2)
+     .sr(1)
+     .m(2)
+     .n(16)
+     .k(8)
+     .qmin(128)
+     .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, qmax) {  // qmax = 128 at m = 2, n = 16, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(2)
+     .nr(16)
+     .kr(2)
+     .sr(1)
+     .m(2)
+     .n(16)
+     .k(8)
+     .qmax(128)
+     .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MULL_PADAL_DUP, strided_cm) {  // cm_stride = 19 at m = 2, n = 16, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(2)
+     .nr(16)
+     .kr(2)
+     .sr(1)
+     .m(2)
+     .n(16)
+     .k(8)
+     .cm_stride(19)
+     .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {  // m = 3, n = 16, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(3)
+     .nr(16)
+     .kr(2)
+     .sr(1)
+     .m(3)
+     .n(16)
+     .k(8)
+     .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, strided_cn) {  // cn_stride = 19 at m = 3, n = 16, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(3)
+     .nr(16)
+     .kr(2)
+     .sr(1)
+     .m(3)
+     .n(16)
+     .k(8)
+     .cn_stride(19)
+     .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {  // k = 8 for every m in 1..3, n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
+     for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(m_size)
+         .n(n_size)
+         .k(8)
+         .iterations(1)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {  // k = 8, n = 16, m = 1..3
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
+     GemmMicrokernelTester()
+       .mr(3)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(m_size)
+       .n(16)
+       .k(8)
+       .iterations(1)
+       .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {  // k = 8, m = 3, n = 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+     GemmMicrokernelTester()
+       .mr(3)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(3)
+       .n(n_size)
+       .k(8)
+       .iterations(1)
+       .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {  // k = 1..7 at m = 3, n = 16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size < 8; ++k_size) {
+     GemmMicrokernelTester()
+       .mr(3)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(3)
+       .n(16)
+       .k(k_size)
+       .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {  // k = 1..7 for every m in 1..3, n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size < 8; ++k_size) {
+     for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(3)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {  // k = 9..15 at m = 3, n = 16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 9; k_size < 16; ++k_size) {
+     GemmMicrokernelTester()
+       .mr(3)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(3)
+       .n(16)
+       .k(k_size)
+       .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {  // k = 9..15 for every m in 1..3, n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 9; k_size < 16; ++k_size) {
+     for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(3)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_div_8) {  // k = 16, 24, ..., 80 at m = 3, n = 16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 16; k_size <= 80; k_size += 8) {
+     GemmMicrokernelTester()
+       .mr(3)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(3)
+       .n(16)
+       .k(k_size)
+       .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {  // k = 16, 24, ..., 80 for every m in 1..3, n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 16; k_size <= 80; k_size += 8) {
+     for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(3)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {  // n = 17..31 (above nr = 16) at m = 3, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(3)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {  // n = 17..31 with cn_stride = 19, m = 3, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(3)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .cn_stride(19)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {  // n = 17..31, k = 1, 10, ..., 37, m = 1..3
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n_size = 17; n_size < 32; ++n_size) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
+         GemmMicrokernelTester()
+           .mr(3)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_div_16) {  // n = 32, 48 (multiples of nr = 16) at m = 3, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 32; n <= 48; n += 16) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(3)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {  // n = 32, 48 with cn_stride = 19, m = 3
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(3)
+         .n(n_size)
+         .k(k_size)
+         .cn_stride(19)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {  // n = 32, 48, k = 1, 10, ..., 37, m = 1..3
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n_size = 32; n_size <= 48; n_size += 16) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
+         GemmMicrokernelTester()
+           .mr(3)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, small_kernel) {  // ks = 3 at m = 3, n = 16, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     GemmMicrokernelTester()
+       .mr(3)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(3)
+       .n(16)
+       .k(k_size)
+       .ks(3)
+       .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {  // ks = 3 for every m in 1..3, n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(3)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .ks(3)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {  // n = 17..31 with ks = 3, m = 3, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(3)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .ks(3)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {  // n = 32, 48 with ks = 3, m = 3, k = 1, 10, ..., 37
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 32; n <= 48; n += 16) {
+     for (size_t k = 1; k <= 40; k += 9) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(3)
+         .n(n)  // pass the loop's n; a constant 16 here would leave the loop variable unused
+         .k(k)
+         .ks(3)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {  // cm_stride = 19 for every m in 1..3, n in 1..16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     for (uint32_t m_size = 1; m_size <= 3; ++m_size) {
+       for (uint32_t n_size = 1; n_size <= 16; ++n_size) {
+         GemmMicrokernelTester()
+           .mr(3)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(m_size)
+           .n(n_size)
+           .k(k_size)
+           .cm_stride(19)
+           .iterations(1)
+           .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, a_offset) {  // a_offset = 127 with ks = 3, m = 3, n = 16
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+     GemmMicrokernelTester()
+       .mr(3)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(3)
+       .n(16)
+       .k(k_size)
+       .ks(3)
+       .a_offset(127)
+       .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, zero) {  // zero_index = 0..2 with ks = 3, a_offset = 127
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t zero_idx = 0; zero_idx < 3; ++zero_idx) {
+     for (size_t k_size = 1; k_size <= 40; k_size += 9) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(3)
+         .n(16)
+         .k(k_size)
+         .ks(3)
+         .a_offset(127)
+         .zero_index(zero_idx)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, qmin) {  // qmin = 128 at m = 3, n = 16, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(3)
+     .nr(16)
+     .kr(2)
+     .sr(1)
+     .m(3)
+     .n(16)
+     .k(8)
+     .qmin(128)
+     .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, qmax) {  // qmax = 128 at m = 3, n = 16, k = 8
+   TEST_REQUIRES_ARM_NEON;
+   GemmMicrokernelTester()
+     .mr(3)
+     .nr(16)
+     .kr(2)
+     .sr(1)
+     .m(3)
+     .n(16)
+     .k(8)
+     .qmax(128)
+     .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MULL_PADAL_DUP, strided_cm) {  // m = 3, n = 16, k = 8 with cm_stride(19)
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8) {  // m = 4, n = 16, k = 8
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, strided_cn) {  // m = 4, n = 16, k = 8 with cn_stride(19)
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile) {  // k = 8 for every m in [1, 4] and n in [1, 16]
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_m) {  // k = 8, n = 16, m swept over [1, 4]
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_eq_8_subtile_n) {  // k = 8, m = 4, n swept over [1, 16]
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(8)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_lt_8) {  // m = 4, n = 16, k swept over [1, 7]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_lt_8_subtile) {  // k in [1, 7] for every m in [1, 4] and n in [1, 16]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 8; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_gt_8) {  // m = 4, n = 16, k swept over [9, 15]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_gt_8_subtile) {  // k in [9, 15] for every m in [1, 4] and n in [1, 16]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 9; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_div_8) {  // m = 4, n = 16, k = 16, 24, ..., 80
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, k_div_8_subtile) {  // k = 16, 24, ..., 80 for every m in [1, 4] and n in [1, 16]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 16; k <= 80; k += 8) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16) {  // m = 4, n in [17, 31]; k in {1, 10, 19, 28, 37}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)  // pass the loop's n; a constant 16 would repeat one case and never reach n > nr
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_strided_cn) {  // m = 4, n in [17, 31] with cn_stride(19); k in {1, 10, 19, 28, 37}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)  // pass the loop's n; a constant 16 would repeat one case and never reach n > nr
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_subtile) {  // n in [17, 31], m in [1, 4]; k in {1, 10, 19, 28, 37}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_div_16) {  // m = 4, n in {32, 48}; k in {1, 10, 19, 28, 37}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)  // pass the loop's n; a constant 16 would repeat one case and never reach n > nr
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_strided_cn) {  // m = 4, n in {32, 48} with cn_stride(19); k in {1, 10, 19, 28, 37}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_subtile) {  // n in {32, 48}, m in [1, 4]; k in {1, 10, 19, 28, 37}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, small_kernel) {  // m = 4, n = 16 with ks(3); k in {1, 10, 19, 28, 37}
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, small_kernel_subtile) {  // ks(3) for every m in [1, 4] and n in [1, 16]; k in {1, 10, 19, 28, 37}
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_gt_16_small_kernel) {  // m = 4, n in [17, 31] with ks(3); k in {1, 10, 19, 28, 37}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)  // pass the loop's n; a constant 16 would repeat one case and never reach n > nr
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, n_div_16_small_kernel) {  // m = 4, n in {32, 48} with ks(3); k in {1, 10, 19, 28, 37}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)  // pass the loop's n; a constant 16 would repeat one case and never reach n > nr
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, strided_cm_subtile) {  // cm_stride(19) for every m in [1, 4] and n in [1, 16]; k in {1, 10, 19, 28, 37}
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, a_offset) {  // m = 4, n = 16 with ks(3) and a_offset(163); k in {1, 10, 19, 28, 37}
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, zero) {  // ks(3), a_offset(163), zero_index swept over [0, 3]; k in {1, 10, 19, 28, 37}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 40; k += 9) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, qmin) {  // m = 4, n = 16, k = 8 with qmin(128)
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, qmax) {  // m = 4, n = 16, k = 8 with qmax(128)
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MULL_PADAL_DUP, strided_cm) {  // m = 4, n = 16, k = 8 with cm_stride(19)
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(8)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {  // m = 1, n = 8, k = 16
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {  // m = 1, n = 8, k = 16 with cn_stride(11)
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {  // k = 16, m = 1 (single-iteration loop), n in [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {  // k = 16, n = 8, m = 1 (single-iteration loop)
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {  // k = 16, m = 1, n swept over [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {  // m = 1, n = 8, k swept over [1, 15]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {  // k in [1, 15], m = 1 (single-iteration loop), n in [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {  // m = 1, n = 8, k swept over [17, 31]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {  // k in [17, 31], m = 1 (single-iteration loop), n in [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {  // m = 1, n = 8, k = 32, 48, ..., 160
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {  // k = 32, 48, ..., 160, m = 1 (single-iteration loop), n in [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {  // m = 1, n in [9, 15]; k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // pass the loop's n; a constant 8 would repeat one case and never reach n > nr
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {  // m = 1, n in [9, 15] with cn_stride(11); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // pass the loop's n; a constant 8 would repeat one case and never reach n > nr
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {  // n in [9, 15], m = 1 (single-iteration loop); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {  // m = 1, n in {16, 24}; k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // pass the loop's n; a constant 8 would repeat one case and never reach n > nr
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {  // m = 1, n in {16, 24} with cn_stride(11); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {  // n in {16, 24}, m = 1 (single-iteration loop); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {  // m = 1, n = 8 with ks(3); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {  // ks(3), m = 1 (single-iteration loop), n in [1, 8]; k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {  // m = 1, n in [9, 15] with ks(3); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // pass the loop's n; a constant 8 would repeat one case and never reach n > nr
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {  // m = 1, n in {16, 24} with ks(3); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // pass the loop's n; a constant 8 would repeat one case and never reach n > nr
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {  // cm_stride(11), m = 1 (single-iteration loop), n in [1, 8]; k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, a_offset) {  // m = 1, n = 8 with ks(3) and a_offset(83); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, zero) {  // ks(3), a_offset(83), zero_index = 0 (single-iteration loop); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 1; mz++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, qmin) {  // m = 1, n = 8, k = 16 with qmin(128)
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, qmax) {  // m = 1, n = 8, k = 16 with qmax(128)
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {  // m = 1, n = 8, k = 16 with cm_stride(11)
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(8)
+ .k(16)
+ .cm_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {  // m = 2, n = 8, k = 16
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {  // m = 2, n = 8, k = 16 with cn_stride(11)
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {  // k = 16 for every m in [1, 2] and n in [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {  // k = 16, n = 8, m swept over [1, 2]
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {  // k = 16, m = 2, n swept over [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {  // m = 2, n = 8, k swept over [1, 15]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {  // k in [1, 15] for every m in [1, 2] and n in [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {  // m = 2, n = 8, k swept over [17, 31]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {  // k in [17, 31] for every m in [1, 2] and n in [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {  // m = 2, n = 8, k = 32, 48, ..., 160
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {  // k = 32, 48, ..., 160 for every m in [1, 2] and n in [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {  // m = 2, n in [9, 15]; k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // pass the loop's n; a constant 8 would repeat one case and never reach n > nr
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {  // m = 2, n in [9, 15] with cn_stride(11); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // pass the loop's n; a constant 8 would repeat one case and never reach n > nr
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {  // n in [9, 15], m in [1, 2]; k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {  // m = 2, n in {16, 24}; k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // pass the loop's n; a constant 8 would repeat one case and never reach n > nr
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {  // m = 2, n in {16, 24} with cn_stride(11); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {  // n in {16, 24}, m in [1, 2]; k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {  // m = 2, n = 8 with ks(3); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {  // ks(3) for every m in [1, 2] and n in [1, 8]; k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {  // m = 2, n in [9, 15] with ks(3); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // pass the loop's n; a constant 8 would repeat one case and never reach n > nr
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {  // m = 2, n in {16, 24} with ks(3); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // pass the loop's n; a constant 8 would repeat one case and never reach n > nr
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {  // cm_stride(11) for every m in [1, 2] and n in [1, 8]; k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, a_offset) {  // m = 2, n = 8 with ks(3) and a_offset(163); k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, zero) {  // ks(3), a_offset(163), zero_index swept over [0, 1]; k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 2; mz++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, qmin) {  // m = 2, n = 8, k = 16 with qmin(128)
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, qmax) {  // m = 2, n = 8, k = 16 with qmax(128)
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {  // m = 2, n = 8, k = 16 with cm_stride(11)
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(8)
+ .k(16)
+ .cm_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {  // m = 3, n = 8, k = 16
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(16)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {  // m = 3, n = 8, k = 16 with cn_stride(11)
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(16)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {  // k = 16 for every m in [1, 3] and n in [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {  // k = 16, n = 8, m swept over [1, 3]
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {  // k = 16, m = 3, n swept over [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {  // m = 3, n = 8, k swept over [1, 15]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {  // k in [1, 15] for every m in [1, 3] and n in [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {  // m = 3, n = 8, k swept over [17, 31]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {  // k in [17, 31] for every m in [1, 3] and n in [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {  // m = 3, n = 8, k = 32, 48, ..., 160
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {  // k = 32, 48, ..., 160 for every m in [1, 3] and n in [1, 8]
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {  // m = 3, n in [9, 15]; k in {1, 18, 35, 52, 69}
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)  // pass the loop's n; a constant 8 would repeat one case and never reach n > nr
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {  // Sweeps n over 9..15 with cn_stride = 11.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)  // use the loop variable; a fixed .n(8) left the n sweep untested
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {  // Sweeps n over 16 and 24, multiples of nr = 8.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)  // use the loop variable; a fixed .n(8) left the n sweep untested
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {  // Sweeps n over 9..15 with ks = 3.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)  // use the loop variable; a fixed .n(8) left the n sweep untested
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {  // Sweeps n over 16 and 24 with ks = 3.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(n)  // use the loop variable; a fixed .n(8) left the n sweep untested
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 3; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(251)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 3; mz++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(251)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(8)
+ .k(16)
+ .cm_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(16)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(16)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(8)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8) {  // Sweeps n over 9..15, i.e. above nr = 8.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)  // use the loop variable; a fixed .n(8) left the n sweep untested
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_strided_cn) {  // Sweeps n over 9..15 with cn_stride = 11.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)  // use the loop variable; a fixed .n(8) left the n sweep untested
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8) {  // Sweeps n over 16 and 24, multiples of nr = 8.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)  // use the loop variable; a fixed .n(8) left the n sweep untested
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)
+ .k(k)
+ .cn_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_gt_8_small_kernel) {  // Sweeps n over 9..15 with ks = 3.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 9; n < 16; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)  // use the loop variable; a fixed .n(8) left the n sweep untested
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, n_div_8_small_kernel) {  // Sweeps n over 16 and 24 with ks = 3.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 16; n <= 24; n += 8) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(n)  // use the loop variable; a fixed .n(8) left the n sweep untested
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 4; m++) {
+ for (uint32_t n = 1; n <= 8; n++) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(11)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(331)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 4; mz++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(k)
+ .ks(3)
+ .a_offset(331)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X8C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(8)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(8)
+ .k(16)
+ .cm_stride(11)
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(16)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(16)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {  // Sweeps n over 17..31, i.e. above nr = 16.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // use the loop variable; a fixed .n(16) left the n sweep untested
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {  // Sweeps n over 17..31 with cn_stride = 19.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // use the loop variable; a fixed .n(16) left the n sweep untested
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {  // Sweeps n over 32 and 48, multiples of nr = 16.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // use the loop variable; a fixed .n(16) left the n sweep untested
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {  // Sweeps n over 17..31 with ks = 3.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // use the loop variable; a fixed .n(16) left the n sweep untested
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {  // Sweeps n over 32 and 48 with ks = 3.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(n)  // use the loop variable; a fixed .n(16) left the n sweep untested
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 1; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, zero) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t mz = 0; mz < 1; mz++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(83)
+ .zero_index(mz)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, qmin) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(16)
+ .qmin(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, qmax) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(16)
+ .qmax(128)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_1X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(1)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(1)
+ .n(16)
+ .k(16)
+ .cm_stride(19)
+ .Test(xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(16)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(16)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(16)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(16)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k < 16; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 17; k < 32; k++) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 32; k <= 160; k += 16) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {  // Sweeps n over 17..31, i.e. above nr = 16.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // use the loop variable; a fixed .n(16) left the n sweep untested
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {  // Sweeps n over 17..31 with cn_stride = 19.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // use the loop variable; a fixed .n(16) left the n sweep untested
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {  // Sweeps n over 32 and 48, multiples of nr = 16.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // use the loop variable; a fixed .n(16) left the n sweep untested
+ .k(k)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)
+ .k(k)
+ .cn_stride(19)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, small_kernel) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .ks(3)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {  // Sweeps n over 17..31 with ks = 3.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 17; n < 32; n++) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // use the loop variable; a fixed .n(16) left the n sweep untested
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {  // Sweeps n over 32 and 48 with ks = 3.
+ TEST_REQUIRES_ARM_NEON;
+ for (uint32_t n = 32; n <= 48; n += 16) {
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(n)  // use the loop variable; a fixed .n(16) left the n sweep untested
+ .k(k)
+ .ks(3)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ for (uint32_t m = 1; m <= 2; m++) {
+ for (uint32_t n = 1; n <= 16; n++) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(m)
+ .n(n)
+ .k(k)
+ .cm_stride(19)
+ .iterations(1)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, a_offset) {
+ TEST_REQUIRES_ARM_NEON;
+ for (size_t k = 1; k <= 80; k += 17) {
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(k)
+ .ks(3)
+ .a_offset(163)
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, zero) {  // Same setup as a_offset, repeated for zero_index = 0 and 1.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t zero_idx = 0; zero_idx < 2; ++zero_idx) {
+     for (size_t depth = 1; depth <= 80; depth += 17) {
+       GemmMicrokernelTester()
+         .mr(2)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(2)
+         .n(16)
+         .k(depth)
+         .ks(3)
+         .a_offset(163)
+         .zero_index(zero_idx)  // NOTE(review): presumably picks which input row uses the zero buffer - confirm
+         .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, qmin) {  // Single run: full 2x16 tile, k = 16, qmin = 128.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(16)
+ .qmin(128)  // NOTE(review): presumably the lower output clamp bound - confirm in GemmMicrokernelTester
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, qmax) {  // Single run: full 2x16 tile, k = 16, qmax = 128.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(16)
+ .qmax(128)  // NOTE(review): presumably the upper output clamp bound - confirm in GemmMicrokernelTester
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_2X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {  // Single run: full 2x16 tile, k = 16, cm_stride = 19.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(2)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(2)
+ .n(16)
+ .k(16)
+ .cm_stride(19)  // explicit stride of 19, larger than n = 16; other tests here leave it unset
+ .Test(xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {  // Single run: m == mr, n == nr, k = 16.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(16)
+ .k(16)  // 16 equals this kernel's k-block entry in test/qs8-igemm-minmax.yaml
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {  // Single run: full 3x16 tile, k = 16, cn_stride = 19.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(16)
+ .k(16)
+ .cn_stride(19)  // explicit stride of 19, larger than nr = 16; most tests here leave it unset
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {  // Every m x n sub-tile (m <= 3, n <= 16) at k = 16.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t rows = 1; rows <= 3; ++rows) {
+     for (uint32_t cols = 1; cols <= 16; ++cols) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(rows)
+         .n(cols)
+         .k(16)
+         .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {  // m = 1..3 with n fixed at 16 and k = 16.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t rows = 1; rows <= 3; ++rows) {
+     GemmMicrokernelTester()
+       .mr(3)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(rows)
+       .n(16)
+       .k(16)
+       .iterations(1)
+       .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {  // n = 1..16 with m fixed at 3 and k = 16.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t cols = 1; cols <= 16; ++cols) {
+     GemmMicrokernelTester()
+       .mr(3)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(3)
+       .n(cols)
+       .k(16)
+       .iterations(1)
+       .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {  // Full 3x16 tile for every k in 1..15.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 1; depth < 16; ++depth) {
+     GemmMicrokernelTester()
+       .mr(3)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(3)
+       .n(16)
+       .k(depth)
+       .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {  // Every m x n sub-tile (m <= 3, n <= 16) for every k in 1..15.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 1; depth < 16; ++depth) {
+     for (uint32_t rows = 1; rows <= 3; ++rows) {
+       for (uint32_t cols = 1; cols <= 16; ++cols) {
+         GemmMicrokernelTester()
+           .mr(3)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(rows)
+           .n(cols)
+           .k(depth)
+           .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+           .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {  // Full 3x16 tile for every k in 17..31.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 17; depth < 32; ++depth) {
+     GemmMicrokernelTester()
+       .mr(3)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(3)
+       .n(16)
+       .k(depth)
+       .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {  // Every m x n sub-tile (m <= 3, n <= 16) for every k in 17..31.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 17; depth < 32; ++depth) {
+     for (uint32_t rows = 1; rows <= 3; ++rows) {
+       for (uint32_t cols = 1; cols <= 16; ++cols) {
+         GemmMicrokernelTester()
+           .mr(3)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(rows)
+           .n(cols)
+           .k(depth)
+           .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+           .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {  // Full 3x16 tile for k = 32, 48, ..., 160 (multiples of 16).
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 32; depth <= 160; depth += 16) {
+     GemmMicrokernelTester()
+       .mr(3)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(3)
+       .n(16)
+       .k(depth)
+       .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {  // Every m x n sub-tile (m <= 3, n <= 16) for k = 32, 48, ..., 160.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 32; depth <= 160; depth += 16) {
+     for (uint32_t rows = 1; rows <= 3; ++rows) {
+       for (uint32_t cols = 1; cols <= 16; ++cols) {
+         GemmMicrokernelTester()
+           .mr(3)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(rows)
+           .n(cols)
+           .k(depth)
+           .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+           .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {  // m = 3 with n = 17..31, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 80; k += 17) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(3)
+         .n(n)  // use the loop variable; a fixed 16 never exercised n > nr
+         .k(k)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {  // n = 17..31 with cn_stride = 19, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 80; k += 17) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(3)
+         .n(n)  // use the loop variable, matching n_div_16_strided_cn; a fixed 16 never exercised n > nr
+         .k(k)
+         .cn_stride(19)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {  // n = 17..31 crossed with m = 1..3, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t cols = 17; cols < 32; ++cols) {
+     for (size_t depth = 1; depth <= 80; depth += 17) {
+       for (uint32_t rows = 1; rows <= 3; ++rows) {
+         GemmMicrokernelTester()
+           .mr(3)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(rows)
+           .n(cols)
+           .k(depth)
+           .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+           .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {  // m = 3 with n in {32, 48}, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 32; n <= 48; n += 16) {
+     for (size_t k = 1; k <= 80; k += 17) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(3)
+         .n(n)  // use the loop variable; a fixed 16 never exercised multiples of nr
+         .k(k)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {  // n in {32, 48} with cn_stride = 19, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t cols = 32; cols <= 48; cols += 16) {
+     for (size_t depth = 1; depth <= 80; depth += 17) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(3)
+         .n(cols)
+         .k(depth)
+         .cn_stride(19)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {  // n in {32, 48} crossed with m = 1..3, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t cols = 32; cols <= 48; cols += 16) {
+     for (size_t depth = 1; depth <= 80; depth += 17) {
+       for (uint32_t rows = 1; rows <= 3; ++rows) {
+         GemmMicrokernelTester()
+           .mr(3)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(rows)
+           .n(cols)
+           .k(depth)
+           .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+           .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, small_kernel) {  // Full 3x16 tile with ks = 3, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 1; depth <= 80; depth += 17) {
+     GemmMicrokernelTester()
+       .mr(3)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(3)
+       .n(16)
+       .k(depth)
+       .ks(3)  // the only small_kernel-specific setting; other tests leave ks unset
+       .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {  // Every m x n sub-tile (m <= 3, n <= 16) with ks = 3, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 1; depth <= 80; depth += 17) {
+     for (uint32_t rows = 1; rows <= 3; ++rows) {
+       for (uint32_t cols = 1; cols <= 16; ++cols) {
+         GemmMicrokernelTester()
+           .mr(3)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(rows)
+           .n(cols)
+           .k(depth)
+           .ks(3)
+           .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+           .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {  // n = 17..31 with ks = 3, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 80; k += 17) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(3)
+         .n(n)  // use the loop variable; a fixed 16 never exercised n > nr
+         .k(k)
+         .ks(3)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {  // n in {32, 48} with ks = 3, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 32; n <= 48; n += 16) {
+     for (size_t k = 1; k <= 80; k += 17) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(3)
+         .n(n)  // use the loop variable; a fixed 16 never exercised multiples of nr
+         .k(k)
+         .ks(3)
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {  // Every m x n sub-tile (m <= 3, n <= 16) with cm_stride = 19, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 1; depth <= 80; depth += 17) {
+     for (uint32_t rows = 1; rows <= 3; ++rows) {
+       for (uint32_t cols = 1; cols <= 16; ++cols) {
+         GemmMicrokernelTester()
+           .mr(3)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(rows)
+           .n(cols)
+           .k(depth)
+           .cm_stride(19)
+           .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+           .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, a_offset) {  // Full 3x16 tile, ks = 3, a_offset = 251, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 1; depth <= 80; depth += 17) {
+     GemmMicrokernelTester()
+       .mr(3)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(3)
+       .n(16)
+       .k(depth)
+       .ks(3)
+       .a_offset(251)
+       .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, zero) {  // Same setup as a_offset, repeated for zero_index = 0, 1 and 2.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t zero_idx = 0; zero_idx < 3; ++zero_idx) {
+     for (size_t depth = 1; depth <= 80; depth += 17) {
+       GemmMicrokernelTester()
+         .mr(3)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(3)
+         .n(16)
+         .k(depth)
+         .ks(3)
+         .a_offset(251)
+         .zero_index(zero_idx)  // NOTE(review): presumably picks which input row uses the zero buffer - confirm
+         .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, qmin) {  // Single run: full 3x16 tile, k = 16, qmin = 128.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(16)
+ .k(16)
+ .qmin(128)  // NOTE(review): presumably the lower output clamp bound - confirm in GemmMicrokernelTester
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, qmax) {  // Single run: full 3x16 tile, k = 16, qmax = 128.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(16)
+ .k(16)
+ .qmax(128)  // NOTE(review): presumably the upper output clamp bound - confirm in GemmMicrokernelTester
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_3X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {  // Single run: full 3x16 tile, k = 16, cm_stride = 19.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(3)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(3)
+ .n(16)
+ .k(16)
+ .cm_stride(19)  // explicit stride of 19, larger than n = 16; most tests here leave it unset
+ .Test(xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16) {  // Single run: m == mr, n == nr, k = 16.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)  // 16 equals this kernel's k-block entry in test/qs8-igemm-minmax.yaml
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, strided_cn) {  // Single run: full 4x16 tile, k = 16, cn_stride = 19.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cn_stride(19)  // explicit stride of 19, larger than nr = 16; most tests here leave it unset
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile) {  // Every m x n sub-tile (m <= 4, n <= 16) at k = 16.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t rows = 1; rows <= 4; ++rows) {
+     for (uint32_t cols = 1; cols <= 16; ++cols) {
+       GemmMicrokernelTester()
+         .mr(4)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(rows)
+         .n(cols)
+         .k(16)
+         .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+         .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_m) {  // m = 1..4 with n fixed at 16 and k = 16.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t rows = 1; rows <= 4; ++rows) {
+     GemmMicrokernelTester()
+       .mr(4)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(rows)
+       .n(16)
+       .k(16)
+       .iterations(1)
+       .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_eq_16_subtile_n) {  // n = 1..16 with m fixed at 4 and k = 16.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t cols = 1; cols <= 16; ++cols) {
+     GemmMicrokernelTester()
+       .mr(4)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(4)
+       .n(cols)
+       .k(16)
+       .iterations(1)
+       .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_lt_16) {  // Full 4x16 tile for every k in 1..15.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 1; depth < 16; ++depth) {
+     GemmMicrokernelTester()
+       .mr(4)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(4)
+       .n(16)
+       .k(depth)
+       .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_lt_16_subtile) {  // Every m x n sub-tile (m <= 4, n <= 16) for every k in 1..15.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 1; depth < 16; ++depth) {
+     for (uint32_t rows = 1; rows <= 4; ++rows) {
+       for (uint32_t cols = 1; cols <= 16; ++cols) {
+         GemmMicrokernelTester()
+           .mr(4)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(rows)
+           .n(cols)
+           .k(depth)
+           .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+           .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_gt_16) {  // Full 4x16 tile for every k in 17..31.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 17; depth < 32; ++depth) {
+     GemmMicrokernelTester()
+       .mr(4)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(4)
+       .n(16)
+       .k(depth)
+       .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_gt_16_subtile) {  // Every m x n sub-tile (m <= 4, n <= 16) for every k in 17..31.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 17; depth < 32; ++depth) {
+     for (uint32_t rows = 1; rows <= 4; ++rows) {
+       for (uint32_t cols = 1; cols <= 16; ++cols) {
+         GemmMicrokernelTester()
+           .mr(4)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(rows)
+           .n(cols)
+           .k(depth)
+           .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+           .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_div_16) {  // Full 4x16 tile for k = 32, 48, ..., 160 (multiples of 16).
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 32; depth <= 160; depth += 16) {
+     GemmMicrokernelTester()
+       .mr(4)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(4)
+       .n(16)
+       .k(depth)
+       .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, k_div_16_subtile) {  // Every m x n sub-tile (m <= 4, n <= 16) for k = 32, 48, ..., 160.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 32; depth <= 160; depth += 16) {
+     for (uint32_t rows = 1; rows <= 4; ++rows) {
+       for (uint32_t cols = 1; cols <= 16; ++cols) {
+         GemmMicrokernelTester()
+           .mr(4)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(rows)
+           .n(cols)
+           .k(depth)
+           .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+           .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16) {  // m = 4 with n = 17..31, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 80; k += 17) {
+       GemmMicrokernelTester()
+         .mr(4)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(4)
+         .n(n)  // use the loop variable; a fixed 16 never exercised n > nr
+         .k(k)
+         .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_strided_cn) {  // n = 17..31 with cn_stride = 19, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 80; k += 17) {
+       GemmMicrokernelTester()
+         .mr(4)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(4)
+         .n(n)  // use the loop variable, matching n_div_16_strided_cn; a fixed 16 never exercised n > nr
+         .k(k)
+         .cn_stride(19)
+         .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_subtile) {  // n = 17..31 crossed with m = 1..4, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t cols = 17; cols < 32; ++cols) {
+     for (size_t depth = 1; depth <= 80; depth += 17) {
+       for (uint32_t rows = 1; rows <= 4; ++rows) {
+         GemmMicrokernelTester()
+           .mr(4)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(rows)
+           .n(cols)
+           .k(depth)
+           .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+           .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16) {  // m = 4 with n in {32, 48}, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 32; n <= 48; n += 16) {
+     for (size_t k = 1; k <= 80; k += 17) {
+       GemmMicrokernelTester()
+         .mr(4)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(4)
+         .n(n)  // use the loop variable; a fixed 16 never exercised multiples of nr
+         .k(k)
+         .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_strided_cn) {  // n in {32, 48} with cn_stride = 19, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t cols = 32; cols <= 48; cols += 16) {
+     for (size_t depth = 1; depth <= 80; depth += 17) {
+       GemmMicrokernelTester()
+         .mr(4)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(4)
+         .n(cols)
+         .k(depth)
+         .cn_stride(19)
+         .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_subtile) {  // n in {32, 48} crossed with m = 1..4, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t cols = 32; cols <= 48; cols += 16) {
+     for (size_t depth = 1; depth <= 80; depth += 17) {
+       for (uint32_t rows = 1; rows <= 4; ++rows) {
+         GemmMicrokernelTester()
+           .mr(4)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(rows)
+           .n(cols)
+           .k(depth)
+           .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+           .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, small_kernel) {  // Full 4x16 tile with ks = 3, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 1; depth <= 80; depth += 17) {
+     GemmMicrokernelTester()
+       .mr(4)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(4)
+       .n(16)
+       .k(depth)
+       .ks(3)  // the only small_kernel-specific setting; other tests leave ks unset
+       .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, small_kernel_subtile) {  // Every m x n sub-tile (m <= 4, n <= 16) with ks = 3, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 1; depth <= 80; depth += 17) {
+     for (uint32_t rows = 1; rows <= 4; ++rows) {
+       for (uint32_t cols = 1; cols <= 16; ++cols) {
+         GemmMicrokernelTester()
+           .mr(4)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(rows)
+           .n(cols)
+           .k(depth)
+           .ks(3)
+           .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+           .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_gt_16_small_kernel) {  // n = 17..31 with ks = 3, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 17; n < 32; n++) {
+     for (size_t k = 1; k <= 80; k += 17) {
+       GemmMicrokernelTester()
+         .mr(4)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(4)
+         .n(n)  // use the loop variable; a fixed 16 never exercised n > nr
+         .k(k)
+         .ks(3)
+         .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, n_div_16_small_kernel) {  // n in {32, 48} with ks = 3, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t n = 32; n <= 48; n += 16) {
+     for (size_t k = 1; k <= 80; k += 17) {
+       GemmMicrokernelTester()
+         .mr(4)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(4)
+         .n(n)  // use the loop variable; a fixed 16 never exercised multiples of nr
+         .k(k)
+         .ks(3)
+         .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, strided_cm_subtile) {  // Every m x n sub-tile (m <= 4, n <= 16) with cm_stride = 19, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 1; depth <= 80; depth += 17) {
+     for (uint32_t rows = 1; rows <= 4; ++rows) {
+       for (uint32_t cols = 1; cols <= 16; ++cols) {
+         GemmMicrokernelTester()
+           .mr(4)
+           .nr(16)
+           .kr(2)
+           .sr(1)
+           .m(rows)
+           .n(cols)
+           .k(depth)
+           .cm_stride(19)
+           .iterations(1)  // single iteration per case (presumably to bound sweep runtime)
+           .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+       }
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, a_offset) {  // Full 4x16 tile, ks = 3, a_offset = 331, k in {1, 18, 35, 52, 69}.
+   TEST_REQUIRES_ARM_NEON;
+   for (size_t depth = 1; depth <= 80; depth += 17) {
+     GemmMicrokernelTester()
+       .mr(4)
+       .nr(16)
+       .kr(2)
+       .sr(1)
+       .m(4)
+       .n(16)
+       .k(depth)
+       .ks(3)
+       .a_offset(331)
+       .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, zero) {  // Same setup as a_offset, repeated for zero_index = 0..3.
+   TEST_REQUIRES_ARM_NEON;
+   for (uint32_t zero_idx = 0; zero_idx < 4; ++zero_idx) {
+     for (size_t depth = 1; depth <= 80; depth += 17) {
+       GemmMicrokernelTester()
+         .mr(4)
+         .nr(16)
+         .kr(2)
+         .sr(1)
+         .m(4)
+         .n(16)
+         .k(depth)
+         .ks(3)
+         .a_offset(331)
+         .zero_index(zero_idx)  // NOTE(review): presumably picks which input row uses the zero buffer - confirm
+         .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+     }
+   }
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, qmin) {  // Single run: full 4x16 tile, k = 16, qmin = 128.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmin(128)  // NOTE(review): presumably the lower output clamp bound - confirm in GemmMicrokernelTester
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, qmax) {  // Single run: full 4x16 tile, k = 16, qmax = 128.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .qmax(128)  // NOTE(review): presumably the upper output clamp bound - confirm in GemmMicrokernelTester
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+ }
+
+ TEST(QS8_IGEMM_MINMAX_4X16C2__NEON_MLAL_PADAL_DUP, strided_cm) {  // Single run: full 4x16 tile, k = 16, cm_stride = 19.
+ TEST_REQUIRES_ARM_NEON;
+ GemmMicrokernelTester()
+ .mr(4)
+ .nr(16)
+ .kr(2)
+ .sr(1)
+ .m(4)
+ .n(16)
+ .k(16)
+ .cm_stride(19)  // explicit stride of 19, larger than n = 16; most tests here leave it unset
+ .Test(xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup);
+ }
+#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
+
+
+#if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(QS8_IGEMM_MINMAX_1X8C4__NEONDOT, k_eq_8) {
TEST_REQUIRES_ARM_NEON_DOT;
GemmMicrokernelTester()
diff --git a/test/qs8-igemm-minmax.yaml b/test/qs8-igemm-minmax.yaml
index 25ecb2b..b5139f3 100644
--- a/test/qs8-igemm-minmax.yaml
+++ b/test/qs8-igemm-minmax.yaml
@@ -34,6 +34,38 @@
k-block: 16
- name: xnn_qs8_igemm_minmax_ukernel_4x16c8__neon_mull_padal
k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mull_padal_dup
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mull_padal_dup
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mull_padal_dup
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mull_padal_dup
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mull_padal_dup
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mull_padal_dup
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mull_padal_dup
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mull_padal_dup
+ k-block: 8
+- name: xnn_qs8_igemm_minmax_ukernel_1x8c2__neon_mlal_padal_dup
+ k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_2x8c2__neon_mlal_padal_dup
+ k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_3x8c2__neon_mlal_padal_dup
+ k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_4x8c2__neon_mlal_padal_dup
+ k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_1x16c2__neon_mlal_padal_dup
+ k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_2x16c2__neon_mlal_padal_dup
+ k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_3x16c2__neon_mlal_padal_dup
+ k-block: 16
+- name: xnn_qs8_igemm_minmax_ukernel_4x16c2__neon_mlal_padal_dup
+ k-block: 16
- name: xnn_qs8_igemm_minmax_ukernel_1x8c4__neondot
k-block: 8
- name: xnn_qs8_igemm_minmax_ukernel_4x8c4__neondot