AVX2 version of QS8 GEMM and IGEMM microkernels
PiperOrigin-RevId: 324543693
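
The new MRx8c8 AVX2 microkernels reuse the shared params->sse2 requantization
constants (multiplier, rounding, shift, remainder mask/threshold, output zero
point and min/max). For reference, a minimal scalar sketch of the per-output
requantization they implement with 256-bit intrinsics; this is illustrative
only, not part of the patch, and the helper name and signature are hypothetical:

    static inline int8_t requantize_q31(int32_t acc, int32_t multiplier, int64_t rounding,
                                        uint32_t shift, int32_t remainder_mask,
                                        int32_t remainder_threshold, int16_t output_zero_point,
                                        int16_t output_min, int16_t output_max)
    {
      // Q31 rounding multiply: keep bits 62:31 of acc * multiplier + rounding
      // (assumes arithmetic >> on negative values).
      const int32_t q31prod = (int32_t) (((int64_t) acc * multiplier + rounding) >> 31);
      // Arithmetic right shift with round-to-nearest correction, mirroring the
      // remainder-mask/threshold logic in the kernels.
      const int32_t remainder = (q31prod & remainder_mask) - (int32_t) (q31prod < 0);
      int32_t out = (q31prod >> shift) + (int32_t) (remainder > remainder_threshold);
      // Add the output zero point and clamp to the quantized output range.
      out += output_zero_point;
      if (out < output_min) out = output_min;
      if (out > output_max) out = output_max;
      return (int8_t) out;
    }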
diff --git a/src/qs8-gemm/MRx4c2-minmax-sse.c.in b/src/qs8-gemm/MRx4c2-minmax-sse.c.in
index a6bced4..3af144f 100644
--- a/src/qs8-gemm/MRx4c2-minmax-sse.c.in
+++ b/src/qs8-gemm/MRx4c2-minmax-sse.c.in
@@ -4,6 +4,7 @@
// LICENSE file in the root directory of this source tree.
$SSE_HEADER = {2: "emmintrin.h", 3: "tmmintrin.h", 4: "smmintrin.h", 5: "ammintrin.h"}[SSE]
+$assert MR <= 4
#include <assert.h>
$if SSE == 5:
@@ -83,7 +84,7 @@
$if K == 0:
const __m128i vb${K}${K+1} = _mm_loadu_si128((const __m128i*) w);
$else:
- const __m128i vb${K}${K+1} = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + ${K * 8}));
+ const __m128i vb${K}${K+1} = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + ${K * 8} * sizeof(int8_t)));
const __m128i vsb${K}${K+1} = _mm_cmpgt_epi8(_mm_setzero_si128(), vb${K}${K+1});
const __m128i vxb${K} = _mm_unpacklo_epi8(vb${K}${K+1}, vsb${K}${K+1});
const __m128i vxb${K+1} = _mm_unpackhi_epi8(vb${K}${K+1}, vsb${K}${K+1});
@@ -108,7 +109,7 @@
$if K == 0:
const __m128i vb${K} = _mm_loadl_epi64((const __m128i*) w);
$else:
- const __m128i vb${K} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${K * 8}));
+ const __m128i vb${K} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${K * 8} * sizeof(int8_t)));
$if SSE >= 4:
const __m128i vxb${K} = _mm_cvtepi8_epi16(vb${K});
$else:
@@ -122,7 +123,7 @@
vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(${K}, ${K}, ${K}, ${K})), vxb${K}));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -135,7 +136,7 @@
a${M} = (const int8_t*) ((uintptr_t) a${M} + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
$for M in range(MR):
@@ -148,7 +149,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
$for M in range(MR):
@@ -161,7 +162,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
$for M in range(MR):
@@ -174,7 +175,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
$for M in range(MR):
diff --git a/src/qs8-gemm/MRx4c8-minmax-sse.c.in b/src/qs8-gemm/MRx4c8-minmax-sse.c.in
index 9b98f96..634cdb8 100644
--- a/src/qs8-gemm/MRx4c8-minmax-sse.c.in
+++ b/src/qs8-gemm/MRx4c8-minmax-sse.c.in
@@ -4,6 +4,7 @@
// LICENSE file in the root directory of this source tree.
$SSE_HEADER = {2: "emmintrin.h", 3: "tmmintrin.h", 4: "smmintrin.h", 5: "ammintrin.h"}[SSE]
+$assert MR <= 4
#include <assert.h>
$if SSE == 5:
@@ -85,7 +86,7 @@
$if N == 0:
const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) w);
$else:
- const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) ((uintptr_t) w + ${N * 8}));
+ const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) ((uintptr_t) w + ${N * 8} * sizeof(int8_t)));
const __m128i vsb${N}${N+1} = _mm_cmpgt_epi8(_mm_setzero_si128(), vb${N}${N+1});
const __m128i vxb${N} = _mm_unpacklo_epi8(vb${N}${N+1}, vsb${N}${N+1});
const __m128i vxb${N+1} = _mm_unpackhi_epi8(vb${N}${N+1}, vsb${N}${N+1});
@@ -102,7 +103,7 @@
$if N == 0:
const __m128i vb${N} = _mm_loadl_epi64((const __m128i*) w);
$else:
- const __m128i vb${N} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${N * 8}));
+ const __m128i vb${N} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${N * 8} * sizeof(int8_t)));
$if SSE >= 4:
const __m128i vxb${N} = _mm_cvtepi8_epi16(vb${N});
$else:
@@ -114,7 +115,7 @@
$else:
vacc${M}x${N} = _mm_add_epi32(vacc${M}x${N}, _mm_madd_epi16(vxa${M}, vxb${N}));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
@@ -216,7 +217,7 @@
$for M in range(0, MR, 2):
vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
- $if M > 2:
+ $if MR > 2:
__m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
$else:
__m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
diff --git a/src/qs8-gemm/MRx8c8-minmax-avx2.c.in b/src/qs8-gemm/MRx8c8-minmax-avx2.c.in
new file mode 100644
index 0000000..bdb8fe6
--- /dev/null
+++ b/src/qs8-gemm/MRx8c8-minmax-avx2.c.in
@@ -0,0 +1,210 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert MR <= 4
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_${MR}x8c8__avx2(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const int8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= ${MR});
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
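+  // Per-row input/output pointers; rows beyond mr alias the previous row so
+  // their loads and stores stay in bounds.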
+ const int8_t* a0 = a;
+ int8_t* c0 = c;
+ $for M in range(1, MR):
+ const int8_t* a${M} = (const int8_t*) ((uintptr_t) a${M-1} + a_stride);
+ int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+ $if M % 2 == 0:
+ if XNN_UNPREDICTABLE(mr <= ${M}) {
+ a${M} = a${M-1};
+ c${M} = c${M-1};
+ }
+ $elif M + 1 == MR:
+ if XNN_UNPREDICTABLE(mr != ${M+1}) {
+ a${M} = a${M-1};
+ c${M} = c${M-1};
+ }
+ $else:
+ if XNN_UNPREDICTABLE(mr < ${M+1}) {
+ a${M} = a${M-1};
+ c${M} = c${M-1};
+ }
+
+ do {
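+    // Load the 8 int32 bias values: each 256-bit accumulator pairs two output
+    // columns, one column per 128-bit lane; rows 1..MR-1 start from row 0's accumulators.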
+ const __m128i vbias0x0 = _mm_loadu_si32(w);
+ const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+ __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+ $for N in range(2, 8, 2):
+ const __m128i vbias0x${N} = _mm_loadu_si32((const void*) ((uintptr_t) w + ${N} * sizeof(int32_t)));
+ const __m128i vbias0x${N+1} = _mm_loadu_si32((const void*) ((uintptr_t) w + ${N+1} * sizeof(int32_t)));
+ __m256i vacc0x${N}${N+1} = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x${N}), vbias0x${N+1}, 1);
+ $for M in range(1, MR):
+ $for N in range(0, 8, 2):
+ __m256i vacc${M}x${N}${N+1} = vacc0x${N}${N+1};
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
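+    // Main loop: each iteration consumes 8 int8 values per A row and an 8x8
+    // block of int8 weights (two columns per 256-bit accumulator).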
+ size_t k = 0;
+ while (k < kc) {
+ $for M in range(MR):
+ const __m128i va${M} = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a${M}));
+ const __m256i vxa${M} = _mm256_cvtepi8_epi16(va${M});
+ a${M} += 8;
+
+ $for N in range(0, 8, 2):
+ $if N == 0:
+ const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) w);
+ $else:
+ const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) ((uintptr_t) w + ${N * 8} * sizeof(int8_t)));
+ const __m256i vxb${N}${N+1} = _mm256_cvtepi8_epi16(vb${N}${N+1});
+
+ $for M in range(MR):
+ vacc${M}x${N}${N+1} = _mm256_add_epi32(vacc${M}x${N}${N+1}, _mm256_madd_epi16(vxa${M}, vxb${N}${N+1}));
+
+ w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+ k += 8 * sizeof(int8_t);
+ }
+
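+    // Reduce: two horizontal-add passes collapse the four int32 partial sums per column.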
+ $for M in range(MR):
+ const __m256i vacc${M}x0213 = _mm256_hadd_epi32(vacc${M}x01, vacc${M}x23);
+ const __m256i vacc${M}x4657 = _mm256_hadd_epi32(vacc${M}x45, vacc${M}x67);
+
+ $for M in range(MR):
+ const __m256i vacc${M}x02461357 = _mm256_hadd_epi32(vacc${M}x0213, vacc${M}x4657);
+
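+    // hadd leaves the columns in 0 2 4 6 | 1 3 5 7 lane order; permute back to 0..7.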
+    const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+    $for M in range(MR):
+      __m256i vacc${M}x01234567 = _mm256_permutevar8x32_epi32(vacc${M}x02461357, vpermute_mask);
+
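+    // Requantize: Q31 fixed-point multiply (64-bit products with rounding, keep
+    // bits 62:31), then an arithmetic shift with round-to-nearest correction via
+    // the remainder mask/threshold.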
+ const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+ const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+ $for M in range(MR):
+ const __m256i vacc${M}x23016745 = _mm256_shuffle_epi32(vacc${M}x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+ $for M in range(MR):
+ const __m256i vprod${M}x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc${M}x01234567, vmultiplier), vrounding);
+
+ $for M in range(MR):
+ const __m256i vprod${M}x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc${M}x23016745, vmultiplier), vrounding);
+
+ $for M in range(MR):
+ const __m256i vq31prod${M}x0246 = _mm256_srli_epi64(vprod${M}x0246, 31);
+ const __m256i vq31prod${M}x1357 = _mm256_add_epi64(vprod${M}x1357, vprod${M}x1357);
+
+ $for M in range(MR):
+ const __m256i vq31prod${M}x01234567 = _mm256_blend_epi16(vq31prod${M}x0246, vq31prod${M}x1357, 0xCC);
+
+ const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+ $for M in range(MR):
+ const __m256i vrem${M}x01234567 =
+ _mm256_add_epi32(_mm256_and_si256(vq31prod${M}x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod${M}x01234567));
+
+ const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+ const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+ $for M in range(MR):
+ vacc${M}x01234567 =
+ _mm256_sub_epi32(_mm256_sra_epi32(vq31prod${M}x01234567, vshift), _mm256_cmpgt_epi32(vrem${M}x01234567, vremainder_threshold));
+
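+    // Convert to the quantized output: saturating pack to int16, add the output
+    // zero point, undo the pack's lane interleave, clamp, then pack to int8.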
+ const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+ $for M in range(0, MR, 2):
+ __m256i vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc${M}x01234567, vacc${min(M+1, MR-1)}x01234567), voutput_zero_point);
+
+ $for M in range(0, MR, 2):
+ vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_permute4x64_epi64(vacc${M}${min(M+1, MR-1)}x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+ const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+ const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+ $for M in range(0, MR, 2):
+ vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc${M}${min(M+1, MR-1)}x01234567, voutput_min), voutput_max);
+
+ $if MR > 2:
+ __m256i vout = _mm256_packs_epi16(vacc0${min(1, MR-1)}x01234567, vacc${min(2, MR-1)}${min(3, MR-1)}x01234567);
+ $else:
+ __m256i vout = _mm256_packs_epi16(vacc0${min(1, MR-1)}x01234567, vacc0${min(1, MR-1)}x01234567);
+ __m128i vout_lo = _mm256_castsi256_si128(vout);
+ __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
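+    // vout_lo holds rows 0 and 2 (8 int8 each); vout_hi holds rows 1 and 3.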
+ if (nc >= 8) {
+ _mm_storel_epi64((__m128i*) c0, vout_lo);
+ $if MR > 1:
+ _mm_storel_epi64((__m128i*) c1, vout_hi);
+ $if MR > 2:
+ _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
+ $if MR > 3:
+ _mm_storeh_pi((__m64*) c3, _mm_castsi128_ps(vout_hi));
+
+ $for M in range(MR):
+ a${M} = (const int8_t*) ((uintptr_t) a${M} - k);
+
+ $for M in range(MR):
+ c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+
+ nc -= 8;
+ } else {
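+      // Partial tile: write 4, 2, then 1 column(s), shifting the packed bytes down between steps.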
+ if (nc & 4) {
+ _mm_storeu_si32(c0, vout_lo);
+ $if MR > 1:
+ _mm_storeu_si32(c1, vout_hi);
+ $if MR > 2:
+ *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
+ $if MR > 3:
+ *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout_hi, 2);
+
+ $for M in range(MR):
+ c${M} += 4;
+
+ vout_lo = _mm_srli_epi64(vout_lo, 32);
+ vout_hi = _mm_srli_epi64(vout_hi, 32);
+ }
+ if (nc & 2) {
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+ $if MR > 1:
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
+ $if MR > 2:
+ *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
+ $if MR > 3:
+ *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout_hi, 4);
+
+ $for M in range(MR):
+ c${M} += 2;
+
+ vout_lo = _mm_srli_epi32(vout_lo, 16);
+ vout_hi = _mm_srli_epi32(vout_hi, 16);
+ }
+ if (nc & 1) {
+ *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+ $if MR > 1:
+ *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
+ $if MR > 2:
+ *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
+ $if MR > 3:
+ *c3 = (uint8_t) _mm_extract_epi8(vout_hi, 8);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
index 242187e..79d0933 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
@@ -58,7 +58,7 @@
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -69,7 +69,7 @@
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -78,7 +78,7 @@
a0 = (const int8_t*) ((uintptr_t) a0 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -86,7 +86,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -94,7 +94,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -102,7 +102,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
index df743e5..09299c9 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
@@ -53,23 +53,23 @@
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -78,7 +78,7 @@
a0 = (const int8_t*) ((uintptr_t) a0 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -86,7 +86,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -94,7 +94,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -102,7 +102,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
index 196256c..d8aeb5d 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
@@ -58,7 +58,7 @@
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -69,7 +69,7 @@
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -78,7 +78,7 @@
a0 = (const int8_t*) ((uintptr_t) a0 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -86,7 +86,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -94,7 +94,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -102,7 +102,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
index 70a2007..bf725d4 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
@@ -53,23 +53,23 @@
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -78,7 +78,7 @@
a0 = (const int8_t*) ((uintptr_t) a0 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -86,7 +86,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -94,7 +94,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -102,7 +102,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
index 873c750..d06ae9d 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
@@ -58,7 +58,7 @@
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -69,7 +69,7 @@
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -78,7 +78,7 @@
a0 = (const int8_t*) ((uintptr_t) a0 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -86,7 +86,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -94,7 +94,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -102,7 +102,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
index c5a5235..ddb4be1 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
@@ -53,23 +53,23 @@
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -78,7 +78,7 @@
a0 = (const int8_t*) ((uintptr_t) a0 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -86,7 +86,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -94,7 +94,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -102,7 +102,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
index 3e6be1a..b7321ed 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
@@ -63,7 +63,7 @@
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
- const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -74,7 +74,7 @@
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -83,7 +83,7 @@
a0 = (const int8_t*) ((uintptr_t) a0 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_maddd_epi16(
@@ -91,7 +91,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_maddd_epi16(
@@ -99,7 +99,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_maddd_epi16(
@@ -107,7 +107,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_maddd_epi16(
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
index aeeae22..e79fcea 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
@@ -58,23 +58,23 @@
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
vacc0x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -83,7 +83,7 @@
a0 = (const int8_t*) ((uintptr_t) a0 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_maddd_epi16(
@@ -91,7 +91,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_maddd_epi16(
@@ -99,7 +99,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_maddd_epi16(
@@ -107,7 +107,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_maddd_epi16(
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
index 0e33bdf..3bd7613 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
@@ -58,7 +58,7 @@
vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
- const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -66,7 +66,7 @@
vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
index 225638a..a7df502 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
@@ -55,20 +55,20 @@
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
index 87e1325..c7daf75 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
@@ -58,7 +58,7 @@
vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
- const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -66,7 +66,7 @@
vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
index d04dd53..5e6028b 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
@@ -55,20 +55,20 @@
const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
index e3437f4..dc09f3b 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
@@ -58,7 +58,7 @@
vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
- const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -66,7 +66,7 @@
vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
index 4cffc38..6091aef 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
@@ -55,20 +55,20 @@
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
index 194e983..1f257ec 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
@@ -63,7 +63,7 @@
vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
- const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -71,7 +71,7 @@
vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
index 43fa056..3d28582 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
@@ -60,20 +60,20 @@
const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-avx2.c b/src/qs8-gemm/gen/1x8c8-minmax-avx2.c
new file mode 100644
index 0000000..0121281
--- /dev/null
+++ b/src/qs8-gemm/gen/1x8c8-minmax-avx2.c
@@ -0,0 +1,159 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx8c8-minmax-avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const int8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const int8_t* a0 = a;
+ int8_t* c0 = c;
+
+ do {
+ const __m128i vbias0x0 = _mm_loadu_si32(w);
+ const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+ __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+ const __m128i vbias0x2 = _mm_loadu_si32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t)));
+ const __m128i vbias0x3 = _mm_loadu_si32((const void*) ((uintptr_t) w + 3 * sizeof(int32_t)));
+ __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
+ const __m128i vbias0x4 = _mm_loadu_si32((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
+ const __m128i vbias0x5 = _mm_loadu_si32((const void*) ((uintptr_t) w + 5 * sizeof(int32_t)));
+ __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
+ const __m128i vbias0x6 = _mm_loadu_si32((const void*) ((uintptr_t) w + 6 * sizeof(int32_t)));
+ const __m128i vbias0x7 = _mm_loadu_si32((const void*) ((uintptr_t) w + 7 * sizeof(int32_t)));
+ __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+ size_t k = 0;
+ while (k < kc) {
+ const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
+ const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
+ a0 += 8;
+
+ const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+ const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
+
+ vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
+ const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+ const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
+
+ vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
+ const __m128i vb45 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int8_t)));
+ const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
+
+ vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
+ const __m128i vb67 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 48 * sizeof(int8_t)));
+ const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
+
+ vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
+
+ w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+ k += 8 * sizeof(int8_t);
+ }
+
+ const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
+ const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
+
+ const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
+
+    const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+    __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
+
+ const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+ const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+ const __m256i vacc0x23016745 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+ const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
+
+ const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x23016745, vmultiplier), vrounding);
+
+ const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
+ const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
+
+ const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
+
+ const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+ const __m256i vrem0x01234567 =
+ _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
+
+ const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+ const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+ vacc0x01234567 =
+ _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
+
+ const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+ __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
+
+ vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+ const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+ const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+ vacc00x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc00x01234567, voutput_min), voutput_max);
+
+ __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
+ __m128i vout_lo = _mm256_castsi256_si128(vout);
+ __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
+ if (nc >= 8) {
+ _mm_storel_epi64((__m128i*) c0, vout_lo);
+
+ a0 = (const int8_t*) ((uintptr_t) a0 - k);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ _mm_storeu_si32(c0, vout_lo);
+
+ c0 += 4;
+
+ vout_lo = _mm_srli_epi64(vout_lo, 32);
+ vout_hi = _mm_srli_epi64(vout_hi, 32);
+ }
+ if (nc & 2) {
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+
+ c0 += 2;
+
+ vout_lo = _mm_srli_epi32(vout_lo, 16);
+ vout_hi = _mm_srli_epi32(vout_hi, 16);
+ }
+ if (nc & 1) {
+ *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
index 7d3bc56..890cc8f 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
@@ -73,7 +73,7 @@
vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
- const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -83,7 +83,7 @@
vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
index d44732a..2d95f19 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
@@ -69,23 +69,23 @@
vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
index 420be81..ee21467 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
@@ -73,7 +73,7 @@
vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
- const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -83,7 +83,7 @@
vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
index af42b60..ccb0cbe 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
@@ -69,23 +69,23 @@
vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
index 8b9843a..5fe28ca 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
@@ -73,7 +73,7 @@
vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
- const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -83,7 +83,7 @@
vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
index eba6930..5f0cf21 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
@@ -69,23 +69,23 @@
vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
index a98794c..1eae140 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
@@ -78,7 +78,7 @@
vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
- const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -88,7 +88,7 @@
vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
index f2f7b66..5850deb 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
@@ -74,23 +74,23 @@
vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k += 8 * sizeof(int8_t);
}
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-avx2.c b/src/qs8-gemm/gen/2x8c8-minmax-avx2.c
new file mode 100644
index 0000000..5bcba2b
--- /dev/null
+++ b/src/qs8-gemm/gen/2x8c8-minmax-avx2.c
@@ -0,0 +1,198 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx8c8-minmax-avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const int8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 2);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const int8_t* a0 = a;
+ int8_t* c0 = c;
+ const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+
+ do {
+ const __m128i vbias0x0 = _mm_loadu_si32(w);
+ const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+ __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+ const __m128i vbias0x2 = _mm_loadu_si32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t)));
+ const __m128i vbias0x3 = _mm_loadu_si32((const void*) ((uintptr_t) w + 3 * sizeof(int32_t)));
+ __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
+ const __m128i vbias0x4 = _mm_loadu_si32((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
+ const __m128i vbias0x5 = _mm_loadu_si32((const void*) ((uintptr_t) w + 5 * sizeof(int32_t)));
+ __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
+ const __m128i vbias0x6 = _mm_loadu_si32((const void*) ((uintptr_t) w + 6 * sizeof(int32_t)));
+ const __m128i vbias0x7 = _mm_loadu_si32((const void*) ((uintptr_t) w + 7 * sizeof(int32_t)));
+ __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
+ __m256i vacc1x01 = vacc0x01;
+ __m256i vacc1x23 = vacc0x23;
+ __m256i vacc1x45 = vacc0x45;
+ __m256i vacc1x67 = vacc0x67;
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+ size_t k = 0;
+ while (k < kc) {
+ const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
+ const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
+ a0 += 8;
+ const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
+ const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
+ a1 += 8;
+
+ const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+ const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
+
+ vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
+ vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
+ const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+ const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
+
+ vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
+ vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
+ const __m128i vb45 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int8_t)));
+ const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
+
+ vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
+ vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
+ const __m128i vb67 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 48 * sizeof(int8_t)));
+ const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
+
+ vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
+ vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
+
+ w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+ k += 8 * sizeof(int8_t);
+ }
+
+ const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
+ const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
+ const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
+ const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
+
+ const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
+ const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
+
+ const __m256i vpermuate_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+ __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermuate_mask);
+ __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermuate_mask);
+
+ const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+ const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+ const __m256i vacc0x23016745 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m256i vacc1x23016745 = _mm256_shuffle_epi32(vacc1x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+ const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
+ const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
+
+ const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x23016745, vmultiplier), vrounding);
+ const __m256i vprod1x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x23016745, vmultiplier), vrounding);
+
+ const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
+ const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
+ const __m256i vq31prod1x0246 = _mm256_srli_epi64(vprod1x0246, 31);
+ const __m256i vq31prod1x1357 = _mm256_add_epi64(vprod1x1357, vprod1x1357);
+
+ const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
+ const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
+
+ const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+ const __m256i vrem0x01234567 =
+ _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
+ const __m256i vrem1x01234567 =
+ _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
+
+ const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+ const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+ vacc0x01234567 =
+ _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
+ vacc1x01234567 =
+ _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
+
+ const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+ __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
+
+ vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+ const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+ const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+ vacc01x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc01x01234567, voutput_min), voutput_max);
+
+ __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc01x01234567);
+ __m128i vout_lo = _mm256_castsi256_si128(vout);
+ __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
+ if (nc >= 8) {
+ _mm_storel_epi64((__m128i*) c0, vout_lo);
+ _mm_storel_epi64((__m128i*) c1, vout_hi);
+
+ a0 = (const int8_t*) ((uintptr_t) a0 - k);
+ a1 = (const int8_t*) ((uintptr_t) a1 - k);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ _mm_storeu_si32(c0, vout_lo);
+ _mm_storeu_si32(c1, vout_hi);
+
+ c0 += 4;
+ c1 += 4;
+
+ vout_lo = _mm_srli_epi64(vout_lo, 32);
+ vout_hi = _mm_srli_epi64(vout_hi, 32);
+ }
+ if (nc & 2) {
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
+
+ c0 += 2;
+ c1 += 2;
+
+ vout_lo = _mm_srli_epi32(vout_lo, 16);
+ vout_hi = _mm_srli_epi32(vout_hi, 16);
+ }
+ if (nc & 1) {
+ *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+ *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/3x8c8-minmax-avx2.c b/src/qs8-gemm/gen/3x8c8-minmax-avx2.c
new file mode 100644
index 0000000..4f81cbe
--- /dev/null
+++ b/src/qs8-gemm/gen/3x8c8-minmax-avx2.c
@@ -0,0 +1,240 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-gemm/MRx8c8-minmax-avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ const int8_t* restrict a,
+ size_t a_stride,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 3);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(kc % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ const int8_t* a0 = a;
+ int8_t* c0 = c;
+ const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ a1 = a0;
+ c1 = c0;
+ }
+ const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+ int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ a2 = a1;
+ c2 = c1;
+ }
+
+ do {
+ const __m128i vbias0x0 = _mm_loadu_si32(w);
+ const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+ __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+ const __m128i vbias0x2 = _mm_loadu_si32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t)));
+ const __m128i vbias0x3 = _mm_loadu_si32((const void*) ((uintptr_t) w + 3 * sizeof(int32_t)));
+ __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
+ const __m128i vbias0x4 = _mm_loadu_si32((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
+ const __m128i vbias0x5 = _mm_loadu_si32((const void*) ((uintptr_t) w + 5 * sizeof(int32_t)));
+ __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
+ const __m128i vbias0x6 = _mm_loadu_si32((const void*) ((uintptr_t) w + 6 * sizeof(int32_t)));
+ const __m128i vbias0x7 = _mm_loadu_si32((const void*) ((uintptr_t) w + 7 * sizeof(int32_t)));
+ __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
+ __m256i vacc1x01 = vacc0x01;
+ __m256i vacc1x23 = vacc0x23;
+ __m256i vacc1x45 = vacc0x45;
+ __m256i vacc1x67 = vacc0x67;
+ __m256i vacc2x01 = vacc0x01;
+ __m256i vacc2x23 = vacc0x23;
+ __m256i vacc2x45 = vacc0x45;
+ __m256i vacc2x67 = vacc0x67;
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+ size_t k = 0;
+ while (k < kc) {
+ const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
+ const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
+ a0 += 8;
+ const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
+ const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
+ a1 += 8;
+ const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
+ const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
+ a2 += 8;
+
+ const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+ const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
+
+ vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
+ vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
+ vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
+ const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+ const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
+
+ vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
+ vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
+ vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
+ const __m128i vb45 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int8_t)));
+ const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
+
+ vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
+ vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
+ vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
+ const __m128i vb67 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 48 * sizeof(int8_t)));
+ const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
+
+ vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
+ vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
+ vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
+
+ w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+ k += 8 * sizeof(int8_t);
+ }
+
+ const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
+ const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
+ const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
+ const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
+ const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
+ const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
+
+ const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
+ const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
+ const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
+
+ const __m256i vpermuate_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+ __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermuate_mask);
+ __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermuate_mask);
+ __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermuate_mask);
+
+ const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+ const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+ const __m256i vacc0x23016745 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m256i vacc1x23016745 = _mm256_shuffle_epi32(vacc1x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m256i vacc2x23016745 = _mm256_shuffle_epi32(vacc2x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+ const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
+ const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
+ const __m256i vprod2x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc2x01234567, vmultiplier), vrounding);
+
+ const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x23016745, vmultiplier), vrounding);
+ const __m256i vprod1x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x23016745, vmultiplier), vrounding);
+ const __m256i vprod2x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc2x23016745, vmultiplier), vrounding);
+
+ const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
+ const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
+ const __m256i vq31prod1x0246 = _mm256_srli_epi64(vprod1x0246, 31);
+ const __m256i vq31prod1x1357 = _mm256_add_epi64(vprod1x1357, vprod1x1357);
+ const __m256i vq31prod2x0246 = _mm256_srli_epi64(vprod2x0246, 31);
+ const __m256i vq31prod2x1357 = _mm256_add_epi64(vprod2x1357, vprod2x1357);
+
+ const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
+ const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
+ const __m256i vq31prod2x01234567 = _mm256_blend_epi16(vq31prod2x0246, vq31prod2x1357, 0xCC);
+
+ const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+ const __m256i vrem0x01234567 =
+ _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
+ const __m256i vrem1x01234567 =
+ _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
+ const __m256i vrem2x01234567 =
+ _mm256_add_epi32(_mm256_and_si256(vq31prod2x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod2x01234567));
+
+ const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+ const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+ vacc0x01234567 =
+ _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
+ vacc1x01234567 =
+ _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
+ vacc2x01234567 =
+ _mm256_sub_epi32(_mm256_sra_epi32(vq31prod2x01234567, vshift), _mm256_cmpgt_epi32(vrem2x01234567, vremainder_threshold));
+
+ const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+ __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
+ __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
+
+ vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+ vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+ const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+ const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+ vacc01x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc01x01234567, voutput_min), voutput_max);
+ vacc22x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc22x01234567, voutput_min), voutput_max);
+
+ __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
+ __m128i vout_lo = _mm256_castsi256_si128(vout);
+ __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
+ if (nc >= 8) {
+ _mm_storel_epi64((__m128i*) c0, vout_lo);
+ _mm_storel_epi64((__m128i*) c1, vout_hi);
+ _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
+
+ a0 = (const int8_t*) ((uintptr_t) a0 - k);
+ a1 = (const int8_t*) ((uintptr_t) a1 - k);
+ a2 = (const int8_t*) ((uintptr_t) a2 - k);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ _mm_storeu_si32(c0, vout_lo);
+ _mm_storeu_si32(c1, vout_hi);
+ *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
+
+ c0 += 4;
+ c1 += 4;
+ c2 += 4;
+
+ vout_lo = _mm_srli_epi64(vout_lo, 32);
+ vout_hi = _mm_srli_epi64(vout_hi, 32);
+ }
+ if (nc & 2) {
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
+ *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
+
+ c0 += 2;
+ c1 += 2;
+ c2 += 2;
+
+ vout_lo = _mm_srli_epi32(vout_lo, 16);
+ vout_hi = _mm_srli_epi32(vout_hi, 16);
+ }
+ if (nc & 1) {
+ *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+ *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
+ *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
index 2f17e7f..824fc16 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
@@ -100,7 +100,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -123,7 +123,7 @@
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -141,7 +141,7 @@
a3 = (const int8_t*) ((uintptr_t) a3 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -155,7 +155,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -169,7 +169,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -183,7 +183,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
index 0682df8..d21c2c3 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
@@ -89,7 +89,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -100,7 +100,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -111,7 +111,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -123,7 +123,7 @@
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -141,7 +141,7 @@
a3 = (const int8_t*) ((uintptr_t) a3 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -155,7 +155,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -169,7 +169,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -183,7 +183,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
index e04bd6a..41f08c3 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
@@ -100,7 +100,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -123,7 +123,7 @@
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -141,7 +141,7 @@
a3 = (const int8_t*) ((uintptr_t) a3 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -155,7 +155,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -169,7 +169,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -183,7 +183,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
index af6875a..988afd6 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
@@ -89,7 +89,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -100,7 +100,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -111,7 +111,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -123,7 +123,7 @@
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -141,7 +141,7 @@
a3 = (const int8_t*) ((uintptr_t) a3 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -155,7 +155,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -169,7 +169,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -183,7 +183,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
index 0a49c93..d98801f 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
@@ -100,7 +100,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -123,7 +123,7 @@
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -141,7 +141,7 @@
a3 = (const int8_t*) ((uintptr_t) a3 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -155,7 +155,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -169,7 +169,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -183,7 +183,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
index 2790798..1dfaa19 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
@@ -89,7 +89,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -100,7 +100,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -111,7 +111,7 @@
_mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -123,7 +123,7 @@
vacc3x0123 = _mm_add_epi32(vacc3x0123,
_mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -141,7 +141,7 @@
a3 = (const int8_t*) ((uintptr_t) a3 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -155,7 +155,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -169,7 +169,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -183,7 +183,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
index 7fb3c42..947f6d9 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
@@ -105,7 +105,7 @@
_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
vacc3x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123);
- const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -128,7 +128,7 @@
vacc3x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -146,7 +146,7 @@
a3 = (const int8_t*) ((uintptr_t) a3 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_maddd_epi16(
@@ -160,7 +160,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_maddd_epi16(
@@ -174,7 +174,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_maddd_epi16(
@@ -188,7 +188,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_maddd_epi16(
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
index aa71c9f..f5f654f 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
@@ -94,7 +94,7 @@
_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
vacc3x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123);
- const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+ const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
vacc0x0123 = _mm_maddd_epi16(
@@ -105,7 +105,7 @@
_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
vacc3x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123);
- const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+ const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
vacc0x0123 = _mm_maddd_epi16(
@@ -116,7 +116,7 @@
_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
vacc3x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
- const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+ const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
vacc0x0123 = _mm_maddd_epi16(
@@ -128,7 +128,7 @@
vacc3x0123 = _mm_maddd_epi16(
_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
- w = (const void*) ((uintptr_t) w + 32);
+ w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
k -= 8 * sizeof(int8_t);
}
if (k != 0) {
@@ -146,7 +146,7 @@
a3 = (const int8_t*) ((uintptr_t) a3 + k);
const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
vacc0x0123 = _mm_maddd_epi16(
@@ -160,7 +160,7 @@
if (k > 2 * sizeof(int8_t)) {
const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
vacc0x0123 = _mm_maddd_epi16(
@@ -174,7 +174,7 @@
if (k > 4 * sizeof(int8_t)) {
const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
vacc0x0123 = _mm_maddd_epi16(
@@ -188,7 +188,7 @@
if (k > 6 * sizeof(int8_t)) {
const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
- w = (const void*) ((uintptr_t) w + 8);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
vacc0x0123 = _mm_maddd_epi16(
diff --git a/src/qs8-igemm/MRx4c2-minmax-sse.c.in b/src/qs8-igemm/MRx4c2-minmax-sse.c.in
index 028a64a..563f78f 100644
--- a/src/qs8-igemm/MRx4c2-minmax-sse.c.in
+++ b/src/qs8-igemm/MRx4c2-minmax-sse.c.in
@@ -4,6 +4,7 @@
// LICENSE file in the root directory of this source tree.
$SSE_HEADER = {2: "emmintrin.h", 3: "tmmintrin.h", 4: "smmintrin.h", 5: "ammintrin.h"}[SSE]
+$assert MR <= 4
#include <assert.h>
$if SSE == 5:
diff --git a/src/qs8-igemm/MRx4c8-minmax-sse.c.in b/src/qs8-igemm/MRx4c8-minmax-sse.c.in
index 18edf63..b512894 100644
--- a/src/qs8-igemm/MRx4c8-minmax-sse.c.in
+++ b/src/qs8-igemm/MRx4c8-minmax-sse.c.in
@@ -4,6 +4,7 @@
// LICENSE file in the root directory of this source tree.
$SSE_HEADER = {2: "emmintrin.h", 3: "tmmintrin.h", 4: "smmintrin.h", 5: "ammintrin.h"}[SSE]
+$assert MR <= 4
#include <assert.h>
$if SSE == 5:
@@ -38,7 +39,9 @@
assert(mr <= ${MR});
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (${MR} * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
@@ -224,7 +227,7 @@
$for M in range(0, MR, 2):
vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
- $if M > 2:
+ $if MR > 2:
__m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
$else:
__m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
diff --git a/src/qs8-igemm/MRx8c8-minmax-avx2.c.in b/src/qs8-igemm/MRx8c8-minmax-avx2.c.in
new file mode 100644
index 0000000..9cd6d70
--- /dev/null
+++ b/src/qs8-igemm/MRx8c8-minmax-avx2.c.in
@@ -0,0 +1,224 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert MR <= 4
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_${MR}x8c8__avx2(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= ${MR});
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (${MR} * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+ $for M in range(1, MR):
+ int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+ $if M % 2 == 0:
+ if XNN_UNPREDICTABLE(mr <= ${M}) {
+ c${M} = c${M-1};
+ }
+ $elif M + 1 == MR:
+ if XNN_UNPREDICTABLE(mr != ${M+1}) {
+ c${M} = c${M-1};
+ }
+ $else:
+ if XNN_UNPREDICTABLE(mr < ${M+1}) {
+ c${M} = c${M-1};
+ }
+
+ do {
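+    // Load the 8 per-output-channel int32 biases and spread them pairwise across four 256-bit accumulators.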
+ const __m128i vbias0x0 = _mm_loadu_si32(w);
+ const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+ __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+ $for N in range(2, 8, 2):
+ const __m128i vbias0x${N} = _mm_loadu_si32((const void*) ((uintptr_t) w + ${N} * sizeof(int32_t)));
+ const __m128i vbias0x${N+1} = _mm_loadu_si32((const void*) ((uintptr_t) w + ${N+1} * sizeof(int32_t)));
+ __m256i vacc0x${N}${N+1} = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x${N}), vbias0x${N+1}, 1);
+ $for M in range(1, MR):
+ $for N in range(0, 8, 2):
+ __m256i vacc${M}x${N}${N+1} = vacc0x${N}${N+1};
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+ size_t p = ks;
+ do {
+ $for M in range(MR):
+ const int8_t* restrict a${M} = a[${M}];
+ if XNN_UNPREDICTABLE(a${M} != zero) {
+ a${M} = (const int8_t*) ((uintptr_t) a${M} + a_offset);
+ }
+ a += ${MR};
+
+ size_t k = 0;
+ while (k < kc) {
+ $for M in range(MR):
+ const __m128i va${M} = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a${M}));
+ const __m256i vxa${M} = _mm256_cvtepi8_epi16(va${M});
+ a${M} += 8;
+
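+        // For each pair of output channels: load 16 packed int8 weights, sign-extend to int16, and multiply-accumulate against each input row with _mm256_madd_epi16.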
+ $for N in range(0, 8, 2):
+ $if N == 0:
+ const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) w);
+ $else:
+ const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) ((uintptr_t) w + ${N * 8} * sizeof(int8_t)));
+ const __m256i vxb${N}${N+1} = _mm256_cvtepi8_epi16(vb${N}${N+1});
+
+ $for M in range(MR):
+ vacc${M}x${N}${N+1} = _mm256_add_epi32(vacc${M}x${N}${N+1}, _mm256_madd_epi16(vxa${M}, vxb${N}${N+1}));
+
+ w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+ k += 8 * sizeof(int8_t);
+ }
+ p -= ${MR} * sizeof(void*);
+ } while (p != 0);
+
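+      // Horizontally reduce each pair of accumulators: two rounds of _mm256_hadd_epi32 leave the 8 column sums in 0 2 4 6 1 3 5 7 order, and the permute below restores them to 0..7.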
+ $for M in range(MR):
+ const __m256i vacc${M}x0213 = _mm256_hadd_epi32(vacc${M}x01, vacc${M}x23);
+ const __m256i vacc${M}x4657 = _mm256_hadd_epi32(vacc${M}x45, vacc${M}x67);
+
+ $for M in range(MR):
+ const __m256i vacc${M}x02461357 = _mm256_hadd_epi32(vacc${M}x0213, vacc${M}x4657);
+
+ const __m256i vpermuate_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+ $for M in range(MR):
+ __m256i vacc${M}x01234567 = _mm256_permutevar8x32_epi32(vacc${M}x02461357, vpermuate_mask);
+
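+      // Requantize with the shared SSE2 parameters: multiply by the Q31 multiplier (even and odd 32-bit lanes handled separately), add the rounding term, arithmetic-shift, and apply the remainder correction.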
+ const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+ const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+ $for M in range(MR):
+ const __m256i vacc${M}x23016745 = _mm256_shuffle_epi32(vacc${M}x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+ $for M in range(MR):
+ const __m256i vprod${M}x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc${M}x01234567, vmultiplier), vrounding);
+
+ $for M in range(MR):
+ const __m256i vprod${M}x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc${M}x23016745, vmultiplier), vrounding);
+
+ $for M in range(MR):
+ const __m256i vq31prod${M}x0246 = _mm256_srli_epi64(vprod${M}x0246, 31);
+ const __m256i vq31prod${M}x1357 = _mm256_add_epi64(vprod${M}x1357, vprod${M}x1357);
+
+ $for M in range(MR):
+ const __m256i vq31prod${M}x01234567 = _mm256_blend_epi16(vq31prod${M}x0246, vq31prod${M}x1357, 0xCC);
+
+ const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+ $for M in range(MR):
+ const __m256i vrem${M}x01234567 =
+ _mm256_add_epi32(_mm256_and_si256(vq31prod${M}x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod${M}x01234567));
+
+ const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+ const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+ $for M in range(MR):
+ vacc${M}x01234567 =
+ _mm256_sub_epi32(_mm256_sra_epi32(vq31prod${M}x01234567, vshift), _mm256_cmpgt_epi32(vrem${M}x01234567, vremainder_threshold));
+
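+      // Convert to int8: pack to int16 with saturation, add the output zero point, clamp to [output_min, output_max], then pack to int8.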
+ const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+ $for M in range(0, MR, 2):
+ __m256i vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc${M}x01234567, vacc${min(M+1, MR-1)}x01234567), voutput_zero_point);
+
+ $for M in range(0, MR, 2):
+ vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_permute4x64_epi64(vacc${M}${min(M+1, MR-1)}x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+ const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+ const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+ $for M in range(0, MR, 2):
+ vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc${M}${min(M+1, MR-1)}x01234567, voutput_min), voutput_max);
+
+ $if MR > 2:
+ __m256i vout = _mm256_packs_epi16(vacc0${min(1, MR-1)}x01234567, vacc${min(2, MR-1)}${min(3, MR-1)}x01234567);
+ $else:
+ __m256i vout = _mm256_packs_epi16(vacc0${min(1, MR-1)}x01234567, vacc0${min(1, MR-1)}x01234567);
+ __m128i vout_lo = _mm256_castsi256_si128(vout);
+ __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
+ if (nc >= 8) {
+ $if MR > 3:
+ _mm_storeh_pi((__m64*) c3, _mm_castsi128_ps(vout_hi));
+ $if MR > 2:
+ _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
+ $if MR > 1:
+ _mm_storel_epi64((__m128i*) c1, vout_hi);
+ _mm_storel_epi64((__m128i*) c0, vout_lo);
+
+ $for M in reversed(range(MR)):
+ c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ $if MR > 3:
+ *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout_hi, 2);
+ $if MR > 2:
+ *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
+ $if MR > 1:
+ _mm_storeu_si32(c1, vout_hi);
+ _mm_storeu_si32(c0, vout_lo);
+
+ $for M in reversed(range(MR)):
+ c${M} += 4;
+
+ vout_lo = _mm_srli_epi64(vout_lo, 32);
+ vout_hi = _mm_srli_epi64(vout_hi, 32);
+ }
+ if (nc & 2) {
+ $if MR > 3:
+ *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout_hi, 4);
+ $if MR > 2:
+ *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
+ $if MR > 1:
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+
+ $for M in reversed(range(MR)):
+ c${M} += 2;
+
+ vout_lo = _mm_srli_epi32(vout_lo, 16);
+ vout_hi = _mm_srli_epi32(vout_hi, 16);
+ }
+ if (nc & 1) {
+ $if MR > 3:
+ *c3 = (uint8_t) _mm_extract_epi8(vout_hi, 8);
+ $if MR > 2:
+ *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
+ $if MR > 1:
+ *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
index 8d944c1..ebd336f 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
@@ -32,7 +32,9 @@
assert(mr <= 1);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
index d2e0d7c..2c0b506 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
@@ -32,7 +32,9 @@
assert(mr <= 1);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
index e2d9b1a..0cc2ea1 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
@@ -32,7 +32,9 @@
assert(mr <= 1);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
index 436269c..7361bf5 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
@@ -32,7 +32,9 @@
assert(mr <= 1);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
index 3e18267..66122ee 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
@@ -32,7 +32,9 @@
assert(mr <= 1);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
index dab68bf..393becc 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
@@ -32,7 +32,9 @@
assert(mr <= 1);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
index 7f76f76..8f78a11 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
@@ -37,7 +37,9 @@
assert(mr <= 1);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
index 8163689..8d4fd13 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
@@ -37,7 +37,9 @@
assert(mr <= 1);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-avx2.c b/src/qs8-igemm/gen/1x8c8-minmax-avx2.c
new file mode 100644
index 0000000..fe48dce
--- /dev/null
+++ b/src/qs8-igemm/gen/1x8c8-minmax-avx2.c
@@ -0,0 +1,172 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/MRx8c8-minmax-avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 1);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (1 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+
+ do {
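+    // The packed weights begin with 8 int32 bias values, one per output column. Each
+    // accumulator pair vacc0xNM is seeded with bias N in its low 128-bit lane and bias M
+    // in its high lane; the horizontal reduction below folds them into the column sums.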
+ const __m128i vbias0x0 = _mm_loadu_si32(w);
+ const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+ __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+ const __m128i vbias0x2 = _mm_loadu_si32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t)));
+ const __m128i vbias0x3 = _mm_loadu_si32((const void*) ((uintptr_t) w + 3 * sizeof(int32_t)));
+ __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
+ const __m128i vbias0x4 = _mm_loadu_si32((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
+ const __m128i vbias0x5 = _mm_loadu_si32((const void*) ((uintptr_t) w + 5 * sizeof(int32_t)));
+ __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
+ const __m128i vbias0x6 = _mm_loadu_si32((const void*) ((uintptr_t) w + 6 * sizeof(int32_t)));
+ const __m128i vbias0x7 = _mm_loadu_si32((const void*) ((uintptr_t) w + 7 * sizeof(int32_t)));
+ __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ a += 1;
+
+ size_t k = 0;
+ while (k < kc) {
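+        // Each iteration consumes 8 bytes of K: the 8 input values are sign-extended and
+        // broadcast into both 128-bit lanes, and every 16-byte weight load covers two
+        // output columns (8 bytes each), so one _mm256_madd_epi16 yields partial sums for
+        // a pair of columns.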
+ const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
+ const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
+ a0 += 8;
+
+ const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+ const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
+
+ vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
+ const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+ const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
+
+ vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
+ const __m128i vb45 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int8_t)));
+ const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
+
+ vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
+ const __m128i vb67 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 48 * sizeof(int8_t)));
+ const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
+
+ vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
+
+ w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+ k += 8 * sizeof(int8_t);
+ }
+ p -= 1 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
+ const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
+
+ const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
+
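+    // Two rounds of _mm256_hadd_epi32 leave one 32-bit sum per output column, but in the
+    // interleaved order 0 2 4 6 1 3 5 7; the cross-lane permute below restores ascending
+    // column order.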
+ const __m256i vpermuate_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+ __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermuate_mask);
+
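+    // Requantize with the Q31 fixed-point multiplier: even and odd columns are multiplied
+    // and rounded in separate 64-bit lanes, blended back together, and then shifted with a
+    // remainder correction to complete the rounding division.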
+ const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+ const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+ const __m256i vacc0x23016745 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+ const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
+
+ const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x23016745, vmultiplier), vrounding);
+
+ const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
+ const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
+
+ const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
+
+ const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+ const __m256i vrem0x01234567 =
+ _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
+
+ const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+ const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+ vacc0x01234567 =
+ _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
+
+ const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+ __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
+
+ vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+ const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+ const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+ vacc00x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc00x01234567, voutput_min), voutput_max);
+
+ __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
+ __m128i vout_lo = _mm256_castsi256_si128(vout);
+ __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
+ if (nc >= 8) {
+ _mm_storel_epi64((__m128i*) c0, vout_lo);
+
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ _mm_storeu_si32(c0, vout_lo);
+
+ c0 += 4;
+
+ vout_lo = _mm_srli_epi64(vout_lo, 32);
+ vout_hi = _mm_srli_epi64(vout_hi, 32);
+ }
+ if (nc & 2) {
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+
+ c0 += 2;
+
+ vout_lo = _mm_srli_epi32(vout_lo, 16);
+ vout_hi = _mm_srli_epi32(vout_hi, 16);
+ }
+ if (nc & 1) {
+ *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
index a339331..7db2361 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
@@ -32,7 +32,9 @@
assert(mr <= 2);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
index 1d66dc9..becb16b 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
@@ -32,7 +32,9 @@
assert(mr <= 2);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
index 24f3ffe..c63c8ae 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
@@ -32,7 +32,9 @@
assert(mr <= 2);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
index 2dee200..b7fd902 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
@@ -32,7 +32,9 @@
assert(mr <= 2);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
index d26977b..247616b 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
@@ -32,7 +32,9 @@
assert(mr <= 2);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
index e04b9af..fdf04cd 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
@@ -32,7 +32,9 @@
assert(mr <= 2);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
index 8f052c9..7f5a4f5 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
@@ -37,7 +37,9 @@
assert(mr <= 2);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
index 6b7a9b7..74812a7 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
@@ -37,7 +37,9 @@
assert(mr <= 2);
assert(nc != 0);
assert(kc != 0);
- assert(kc % sizeof(int8_t) == 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-avx2.c b/src/qs8-igemm/gen/2x8c8-minmax-avx2.c
new file mode 100644
index 0000000..2a1cffa
--- /dev/null
+++ b/src/qs8-igemm/gen/2x8c8-minmax-avx2.c
@@ -0,0 +1,212 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/MRx8c8-minmax-avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 2);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (2 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr != 2) {
+ c1 = c0;
+ }
+
+ do {
+ const __m128i vbias0x0 = _mm_loadu_si32(w);
+ const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+ __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+ const __m128i vbias0x2 = _mm_loadu_si32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t)));
+ const __m128i vbias0x3 = _mm_loadu_si32((const void*) ((uintptr_t) w + 3 * sizeof(int32_t)));
+ __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
+ const __m128i vbias0x4 = _mm_loadu_si32((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
+ const __m128i vbias0x5 = _mm_loadu_si32((const void*) ((uintptr_t) w + 5 * sizeof(int32_t)));
+ __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
+ const __m128i vbias0x6 = _mm_loadu_si32((const void*) ((uintptr_t) w + 6 * sizeof(int32_t)));
+ const __m128i vbias0x7 = _mm_loadu_si32((const void*) ((uintptr_t) w + 7 * sizeof(int32_t)));
+ __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
+ __m256i vacc1x01 = vacc0x01;
+ __m256i vacc1x23 = vacc0x23;
+ __m256i vacc1x45 = vacc0x45;
+ __m256i vacc1x67 = vacc0x67;
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+ size_t p = ks;
+ do {
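+      // Indirection: every step over ks supplies one input-row pointer per row of the tile.
+      // Pointers equal to `zero` select the zero-padding row and are not offset by a_offset.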
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const int8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ a += 2;
+
+ size_t k = 0;
+ while (k < kc) {
+ const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
+ const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
+ a0 += 8;
+ const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
+ const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
+ a1 += 8;
+
+ const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+ const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
+
+ vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
+ vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
+ const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+ const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
+
+ vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
+ vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
+ const __m128i vb45 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int8_t)));
+ const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
+
+ vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
+ vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
+ const __m128i vb67 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 48 * sizeof(int8_t)));
+ const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
+
+ vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
+ vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
+
+ w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+ k += 8 * sizeof(int8_t);
+ }
+ p -= 2 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
+ const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
+ const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
+ const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
+
+ const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
+ const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
+
+ const __m256i vpermuate_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+ __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermuate_mask);
+ __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermuate_mask);
+
+ const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+ const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+ const __m256i vacc0x23016745 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m256i vacc1x23016745 = _mm256_shuffle_epi32(vacc1x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+ const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
+ const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
+
+ const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x23016745, vmultiplier), vrounding);
+ const __m256i vprod1x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x23016745, vmultiplier), vrounding);
+
+ const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
+ const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
+ const __m256i vq31prod1x0246 = _mm256_srli_epi64(vprod1x0246, 31);
+ const __m256i vq31prod1x1357 = _mm256_add_epi64(vprod1x1357, vprod1x1357);
+
+ const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
+ const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
+
+ const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+ const __m256i vrem0x01234567 =
+ _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
+ const __m256i vrem1x01234567 =
+ _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
+
+ const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+ const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+ vacc0x01234567 =
+ _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
+ vacc1x01234567 =
+ _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
+
+ const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+ __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
+
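+    // _mm256_packs_epi32 packs within 128-bit lanes, so a 64-bit permute is needed to place
+    // row 0 entirely in the low lane and row 1 in the high lane.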
+ vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+ const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+ const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+ vacc01x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc01x01234567, voutput_min), voutput_max);
+
+ __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc01x01234567);
+ __m128i vout_lo = _mm256_castsi256_si128(vout);
+ __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
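+    // Row 0's eight 8-bit outputs are in the low half of vout_lo and row 1's in vout_hi.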
+ if (nc >= 8) {
+ _mm_storel_epi64((__m128i*) c1, vout_hi);
+ _mm_storel_epi64((__m128i*) c0, vout_lo);
+
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ _mm_storeu_si32(c1, vout_hi);
+ _mm_storeu_si32(c0, vout_lo);
+
+ c1 += 4;
+ c0 += 4;
+
+ vout_lo = _mm_srli_epi64(vout_lo, 32);
+ vout_hi = _mm_srli_epi64(vout_hi, 32);
+ }
+ if (nc & 2) {
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+
+ c1 += 2;
+ c0 += 2;
+
+ vout_lo = _mm_srli_epi32(vout_lo, 16);
+ vout_hi = _mm_srli_epi32(vout_hi, 16);
+ }
+ if (nc & 1) {
+ *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/qs8-igemm/gen/3x8c8-minmax-avx2.c b/src/qs8-igemm/gen/3x8c8-minmax-avx2.c
new file mode 100644
index 0000000..7cce630
--- /dev/null
+++ b/src/qs8-igemm/gen/3x8c8-minmax-avx2.c
@@ -0,0 +1,255 @@
+// Auto-generated file. Do not edit!
+// Template: src/qs8-igemm/MRx8c8-minmax-avx2.c.in
+// Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2(
+ size_t mr,
+ size_t nc,
+ size_t kc,
+ size_t ks,
+ const int8_t** restrict a,
+ const void* restrict w,
+ int8_t* restrict c,
+ size_t cm_stride,
+ size_t cn_stride,
+ size_t a_offset,
+ const int8_t* zero,
+ const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+ assert(mr != 0);
+ assert(mr <= 3);
+ assert(nc != 0);
+ assert(kc != 0);
+ assert(ks != 0);
+ assert(ks % (3 * sizeof(void*)) == 0);
+ assert(a_offset % sizeof(int8_t) == 0);
+ assert(a != NULL);
+ assert(w != NULL);
+ assert(c != NULL);
+
+ int8_t* c0 = c;
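+  // When mr < 3, unused row pointers alias the row above; rows are stored in descending
+  // order below, so the real row's values are written last.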
+ int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+ if XNN_UNPREDICTABLE(mr < 2) {
+ c1 = c0;
+ }
+ int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+ if XNN_UNPREDICTABLE(mr <= 2) {
+ c2 = c1;
+ }
+
+ do {
+ const __m128i vbias0x0 = _mm_loadu_si32(w);
+ const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+ __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+ const __m128i vbias0x2 = _mm_loadu_si32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t)));
+ const __m128i vbias0x3 = _mm_loadu_si32((const void*) ((uintptr_t) w + 3 * sizeof(int32_t)));
+ __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
+ const __m128i vbias0x4 = _mm_loadu_si32((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
+ const __m128i vbias0x5 = _mm_loadu_si32((const void*) ((uintptr_t) w + 5 * sizeof(int32_t)));
+ __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
+ const __m128i vbias0x6 = _mm_loadu_si32((const void*) ((uintptr_t) w + 6 * sizeof(int32_t)));
+ const __m128i vbias0x7 = _mm_loadu_si32((const void*) ((uintptr_t) w + 7 * sizeof(int32_t)));
+ __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
+ __m256i vacc1x01 = vacc0x01;
+ __m256i vacc1x23 = vacc0x23;
+ __m256i vacc1x45 = vacc0x45;
+ __m256i vacc1x67 = vacc0x67;
+ __m256i vacc2x01 = vacc0x01;
+ __m256i vacc2x23 = vacc0x23;
+ __m256i vacc2x45 = vacc0x45;
+ __m256i vacc2x67 = vacc0x67;
+ w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+ size_t p = ks;
+ do {
+ const int8_t* restrict a0 = a[0];
+ if XNN_UNPREDICTABLE(a0 != zero) {
+ a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+ }
+ const int8_t* restrict a1 = a[1];
+ if XNN_UNPREDICTABLE(a1 != zero) {
+ a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+ }
+ const int8_t* restrict a2 = a[2];
+ if XNN_UNPREDICTABLE(a2 != zero) {
+ a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+ }
+ a += 3;
+
+ size_t k = 0;
+ while (k < kc) {
+ const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
+ const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
+ a0 += 8;
+ const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
+ const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
+ a1 += 8;
+ const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
+ const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
+ a2 += 8;
+
+ const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+ const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
+
+ vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
+ vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
+ vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
+ const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+ const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
+
+ vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
+ vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
+ vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
+ const __m128i vb45 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int8_t)));
+ const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
+
+ vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
+ vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
+ vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
+ const __m128i vb67 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 48 * sizeof(int8_t)));
+ const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
+
+ vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
+ vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
+ vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
+
+ w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+ k += 8 * sizeof(int8_t);
+ }
+ p -= 3 * sizeof(void*);
+ } while (p != 0);
+
+ const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
+ const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
+ const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
+ const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
+ const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
+ const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
+
+ const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
+ const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
+ const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
+
+ const __m256i vpermuate_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+ __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermuate_mask);
+ __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermuate_mask);
+ __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermuate_mask);
+
+ const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+ const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+ const __m256i vacc0x23016745 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m256i vacc1x23016745 = _mm256_shuffle_epi32(vacc1x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m256i vacc2x23016745 = _mm256_shuffle_epi32(vacc2x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+ const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
+ const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
+ const __m256i vprod2x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc2x01234567, vmultiplier), vrounding);
+
+ const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x23016745, vmultiplier), vrounding);
+ const __m256i vprod1x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x23016745, vmultiplier), vrounding);
+ const __m256i vprod2x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc2x23016745, vmultiplier), vrounding);
+
+ const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
+ const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
+ const __m256i vq31prod1x0246 = _mm256_srli_epi64(vprod1x0246, 31);
+ const __m256i vq31prod1x1357 = _mm256_add_epi64(vprod1x1357, vprod1x1357);
+ const __m256i vq31prod2x0246 = _mm256_srli_epi64(vprod2x0246, 31);
+ const __m256i vq31prod2x1357 = _mm256_add_epi64(vprod2x1357, vprod2x1357);
+
+ const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
+ const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
+ const __m256i vq31prod2x01234567 = _mm256_blend_epi16(vq31prod2x0246, vq31prod2x1357, 0xCC);
+
+ const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+ const __m256i vrem0x01234567 =
+ _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
+ const __m256i vrem1x01234567 =
+ _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
+ const __m256i vrem2x01234567 =
+ _mm256_add_epi32(_mm256_and_si256(vq31prod2x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod2x01234567));
+
+ const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+ const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+ vacc0x01234567 =
+ _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
+ vacc1x01234567 =
+ _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
+ vacc2x01234567 =
+ _mm256_sub_epi32(_mm256_sra_epi32(vq31prod2x01234567, vshift), _mm256_cmpgt_epi32(vrem2x01234567, vremainder_threshold));
+
+ const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+ __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
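+    // Row 2 has no partner row, so it is packed against itself.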
+ __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
+
+ vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+ vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+ const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+ const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+ vacc01x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc01x01234567, voutput_min), voutput_max);
+ vacc22x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc22x01234567, voutput_min), voutput_max);
+
+ __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
+ __m128i vout_lo = _mm256_castsi256_si128(vout);
+ __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
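+    // vout_lo carries row 0 (low 8 bytes) and row 2 (high 8 bytes); vout_hi carries row 1.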
+ if (nc >= 8) {
+ _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
+ _mm_storel_epi64((__m128i*) c1, vout_hi);
+ _mm_storel_epi64((__m128i*) c0, vout_lo);
+
+ c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+ c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+ c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+ a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+ nc -= 8;
+ } else {
+ if (nc & 4) {
+ *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
+ _mm_storeu_si32(c1, vout_hi);
+ _mm_storeu_si32(c0, vout_lo);
+
+ c2 += 4;
+ c1 += 4;
+ c0 += 4;
+
+ vout_lo = _mm_srli_epi64(vout_lo, 32);
+ vout_hi = _mm_srli_epi64(vout_hi, 32);
+ }
+ if (nc & 2) {
+ *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
+ *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
+ *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+
+ c2 += 2;
+ c1 += 2;
+ c0 += 2;
+
+ vout_lo = _mm_srli_epi32(vout_lo, 16);
+ vout_hi = _mm_srli_epi32(vout_hi, 16);
+ }
+ if (nc & 1) {
+ *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
+ *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
+ *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+ }
+
+ nc = 0;
+ }
+ } while (nc != 0);
+}
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index f4a876d..ecd0b7d 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -549,6 +549,10 @@
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128)
DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2)
+
#ifdef __cplusplus
} // extern "C"
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 590fcae..cf262ee 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -364,6 +364,10 @@
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128)
DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2)
+
#ifdef __cplusplus
} // extern "C"