AVX2 version of QS8 GEMM and IGEMM microkernels
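
Besides the new MRx8c8 AVX2 GEMM template and the generated 1x8c8
microkernel, this change annotates the weight-pointer arithmetic in the
existing SSE GEMM templates and generated kernels with explicit
"* sizeof(int8_t)" factors, adds "$assert MR <= 4" to the templates, and
corrects the "$if M > 2" condition to "$if MR > 2" in the 4c8 output path.
The AVX2 kernel packs two output columns per 256-bit accumulator; below is
a minimal sketch of one inner-loop step for row 0, columns 0-1 (the helper
name and argument order are illustrative only, not part of the kernel API):

    #include <immintrin.h>
    #include <stdint.h>

    // One K-block (8 int8 values) of the MRx8c8 accumulation for a single
    // row and a single pair of output columns.
    static inline __m256i qs8_gemm_avx2_step(
        const int8_t* a0, const void* w, __m256i vacc0x01) {
      // Load 8 activations, duplicate them into both 128-bit lanes, and
      // sign-extend to 16 bits: both lanes now hold the same 8 x int16.
      const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
      const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);

      // Load 16 weights (8 for column 0, 8 for column 1) and sign-extend:
      // the low lane holds column 0, the high lane holds column 1.
      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
      const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);

      // madd yields four partial int32 sums per column; the kernel later
      // reduces them with _mm256_hadd_epi32 before requantization.
      return _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
    }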

PiperOrigin-RevId: 324543693
diff --git a/src/qs8-gemm/MRx4c2-minmax-sse.c.in b/src/qs8-gemm/MRx4c2-minmax-sse.c.in
index a6bced4..3af144f 100644
--- a/src/qs8-gemm/MRx4c2-minmax-sse.c.in
+++ b/src/qs8-gemm/MRx4c2-minmax-sse.c.in
@@ -4,6 +4,7 @@
 // LICENSE file in the root directory of this source tree.
 
 $SSE_HEADER = {2: "emmintrin.h", 3: "tmmintrin.h", 4: "smmintrin.h", 5: "ammintrin.h"}[SSE]
+$assert MR <= 4
 #include <assert.h>
 
 $if SSE == 5:
@@ -83,7 +84,7 @@
           $if K == 0:
             const __m128i vb${K}${K+1} = _mm_loadu_si128((const __m128i*) w);
           $else:
-            const __m128i vb${K}${K+1} = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + ${K * 8}));
+            const __m128i vb${K}${K+1} = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + ${K * 8} * sizeof(int8_t)));
           const __m128i vsb${K}${K+1} = _mm_cmpgt_epi8(_mm_setzero_si128(), vb${K}${K+1});
           const __m128i vxb${K} = _mm_unpacklo_epi8(vb${K}${K+1}, vsb${K}${K+1});
           const __m128i vxb${K+1} = _mm_unpackhi_epi8(vb${K}${K+1}, vsb${K}${K+1});
@@ -108,7 +109,7 @@
           $if K == 0:
             const __m128i vb${K} = _mm_loadl_epi64((const __m128i*) w);
           $else:
-            const __m128i vb${K} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${K * 8}));
+            const __m128i vb${K} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${K * 8} * sizeof(int8_t)));
           $if SSE >= 4:
             const __m128i vxb${K} = _mm_cvtepi8_epi16(vb${K});
           $else:
@@ -122,7 +123,7 @@
               vacc${M}x0123 = _mm_add_epi32(vacc${M}x0123,
                 _mm_madd_epi16(_mm_shuffle_epi32(vxa${M}, _MM_SHUFFLE(${K}, ${K}, ${K}, ${K})), vxb${K}));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -135,7 +136,7 @@
         a${M} = (const int8_t*) ((uintptr_t) a${M} + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       $for M in range(MR):
@@ -148,7 +149,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         $for M in range(MR):
@@ -161,7 +162,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           $for M in range(MR):
@@ -174,7 +175,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             $for M in range(MR):
diff --git a/src/qs8-gemm/MRx4c8-minmax-sse.c.in b/src/qs8-gemm/MRx4c8-minmax-sse.c.in
index 9b98f96..634cdb8 100644
--- a/src/qs8-gemm/MRx4c8-minmax-sse.c.in
+++ b/src/qs8-gemm/MRx4c8-minmax-sse.c.in
@@ -4,6 +4,7 @@
 // LICENSE file in the root directory of this source tree.
 
 $SSE_HEADER = {2: "emmintrin.h", 3: "tmmintrin.h", 4: "smmintrin.h", 5: "ammintrin.h"}[SSE]
+$assert MR <= 4
 #include <assert.h>
 
 $if SSE == 5:
@@ -85,7 +86,7 @@
           $if N == 0:
             const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) w);
           $else:
-            const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) ((uintptr_t) w + ${N * 8}));
+            const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) ((uintptr_t) w + ${N * 8} * sizeof(int8_t)));
           const __m128i vsb${N}${N+1} = _mm_cmpgt_epi8(_mm_setzero_si128(), vb${N}${N+1});
           const __m128i vxb${N} = _mm_unpacklo_epi8(vb${N}${N+1}, vsb${N}${N+1});
           const __m128i vxb${N+1} = _mm_unpackhi_epi8(vb${N}${N+1}, vsb${N}${N+1});
@@ -102,7 +103,7 @@
           $if N == 0:
             const __m128i vb${N} = _mm_loadl_epi64((const __m128i*) w);
           $else:
-            const __m128i vb${N} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${N * 8}));
+            const __m128i vb${N} = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + ${N * 8} * sizeof(int8_t)));
           $if SSE >= 4:
             const __m128i vxb${N} = _mm_cvtepi8_epi16(vb${N});
           $else:
@@ -114,7 +115,7 @@
             $else:
               vacc${M}x${N} = _mm_add_epi32(vacc${M}x${N}, _mm_madd_epi16(vxa${M}, vxb${N}));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
@@ -216,7 +217,7 @@
     $for M in range(0, MR, 2):
       vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
 
-    $if M > 2:
+    $if MR > 2:
       __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
     $else:
       __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
diff --git a/src/qs8-gemm/MRx8c8-minmax-avx2.c.in b/src/qs8-gemm/MRx8c8-minmax-avx2.c.in
new file mode 100644
index 0000000..bdb8fe6
--- /dev/null
+++ b/src/qs8-gemm/MRx8c8-minmax-avx2.c.in
@@ -0,0 +1,210 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert MR <= 4
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_${MR}x8c8__avx2(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  $for M in range(1, MR):
+    const int8_t* a${M} = (const int8_t*) ((uintptr_t) a${M-1} + a_stride);
+    int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        a${M} = a${M-1};
+        c${M} = c${M-1};
+      }
+
+  do {
+    const __m128i vbias0x0 = _mm_loadu_si32(w);
+    const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+    __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+    $for N in range(2, 8, 2):
+      const __m128i vbias0x${N} = _mm_loadu_si32((const void*) ((uintptr_t) w + ${N} * sizeof(int32_t)));
+      const __m128i vbias0x${N+1} = _mm_loadu_si32((const void*) ((uintptr_t) w + ${N+1} * sizeof(int32_t)));
+      __m256i vacc0x${N}${N+1} = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x${N}), vbias0x${N+1}, 1);
+    $for M in range(1, MR):
+      $for N in range(0, 8, 2):
+        __m256i vacc${M}x${N}${N+1} = vacc0x${N}${N+1};
+    w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      $for M in range(MR):
+        const __m128i va${M} = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a${M}));
+        const __m256i vxa${M} = _mm256_cvtepi8_epi16(va${M});
+        a${M} += 8;
+
+      $for N in range(0, 8, 2):
+        $if N == 0:
+          const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) w);
+        $else:
+          const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) ((uintptr_t) w + ${N * 8} * sizeof(int8_t)));
+        const __m256i vxb${N}${N+1} = _mm256_cvtepi8_epi16(vb${N}${N+1});
+
+        $for M in range(MR):
+          vacc${M}x${N}${N+1} = _mm256_add_epi32(vacc${M}x${N}${N+1}, _mm256_madd_epi16(vxa${M}, vxb${N}${N+1}));
+
+      w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    $for M in range(MR):
+      const __m256i vacc${M}x0213 = _mm256_hadd_epi32(vacc${M}x01, vacc${M}x23);
+      const __m256i vacc${M}x4657 = _mm256_hadd_epi32(vacc${M}x45, vacc${M}x67);
+
+    $for M in range(MR):
+      const __m256i vacc${M}x02461357 = _mm256_hadd_epi32(vacc${M}x0213, vacc${M}x4657);
+
+    const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+    $for M in range(MR):
+      __m256i vacc${M}x01234567 = _mm256_permutevar8x32_epi32(vacc${M}x02461357, vpermute_mask);
+
+    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+    $for M in range(MR):
+      const __m256i vacc${M}x23016745 = _mm256_shuffle_epi32(vacc${M}x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+    $for M in range(MR):
+      const __m256i vprod${M}x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc${M}x01234567, vmultiplier), vrounding);
+
+    $for M in range(MR):
+      const __m256i vprod${M}x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc${M}x23016745, vmultiplier), vrounding);
+
+    $for M in range(MR):
+      const __m256i vq31prod${M}x0246 = _mm256_srli_epi64(vprod${M}x0246, 31);
+      const __m256i vq31prod${M}x1357 = _mm256_add_epi64(vprod${M}x1357, vprod${M}x1357);
+
+    $for M in range(MR):
+      const __m256i vq31prod${M}x01234567 = _mm256_blend_epi16(vq31prod${M}x0246, vq31prod${M}x1357, 0xCC);
+
+    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    $for M in range(MR):
+      const __m256i vrem${M}x01234567 =
+        _mm256_add_epi32(_mm256_and_si256(vq31prod${M}x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod${M}x01234567));
+
+    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    $for M in range(MR):
+      vacc${M}x01234567 =
+        _mm256_sub_epi32(_mm256_sra_epi32(vq31prod${M}x01234567, vshift), _mm256_cmpgt_epi32(vrem${M}x01234567, vremainder_threshold));
+
+    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    $for M in range(0, MR, 2):
+      __m256i vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc${M}x01234567, vacc${min(M+1, MR-1)}x01234567), voutput_zero_point);
+
+    $for M in range(0, MR, 2):
+      vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_permute4x64_epi64(vacc${M}${min(M+1, MR-1)}x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+    $for M in range(0, MR, 2):
+      vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc${M}${min(M+1, MR-1)}x01234567, voutput_min), voutput_max);
+
+    $if MR > 2:
+      __m256i vout = _mm256_packs_epi16(vacc0${min(1, MR-1)}x01234567, vacc${min(2, MR-1)}${min(3, MR-1)}x01234567);
+    $else:
+      __m256i vout = _mm256_packs_epi16(vacc0${min(1, MR-1)}x01234567, vacc0${min(1, MR-1)}x01234567);
+    __m128i vout_lo = _mm256_castsi256_si128(vout);
+    __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
+    if (nc >= 8) {
+      _mm_storel_epi64((__m128i*) c0, vout_lo);
+      $if MR > 1:
+        _mm_storel_epi64((__m128i*) c1, vout_hi);
+      $if MR > 2:
+        _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
+      $if MR > 3:
+        _mm_storeh_pi((__m64*) c3, _mm_castsi128_ps(vout_hi));
+
+      $for M in range(MR):
+        a${M} = (const int8_t*) ((uintptr_t) a${M} - k);
+
+      $for M in range(MR):
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        _mm_storeu_si32(c0, vout_lo);
+        $if MR > 1:
+          _mm_storeu_si32(c1, vout_hi);
+        $if MR > 2:
+          *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
+        $if MR > 3:
+          *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout_hi, 2);
+
+        $for M in range(MR):
+          c${M} += 4;
+
+        vout_lo = _mm_srli_epi64(vout_lo, 32);
+        vout_hi = _mm_srli_epi64(vout_hi, 32);
+      }
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+        $if MR > 1:
+          *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
+        $if MR > 2:
+          *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
+        $if MR > 3:
+          *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout_hi, 4);
+
+        $for M in range(MR):
+          c${M} += 2;
+
+        vout_lo = _mm_srli_epi32(vout_lo, 16);
+        vout_hi = _mm_srli_epi32(vout_hi, 16);
+      }
+      if (nc & 1) {
+        *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+        $if MR > 1:
+          *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
+        $if MR > 2:
+          *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
+        $if MR > 3:
+          *c3 = (int8_t) _mm_extract_epi8(vout_hi, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
index 242187e..79d0933 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld128.c
@@ -58,7 +58,7 @@
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -69,7 +69,7 @@
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -78,7 +78,7 @@
       a0 = (const int8_t*) ((uintptr_t) a0 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -86,7 +86,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -94,7 +94,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -102,7 +102,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
index df743e5..09299c9 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse2-ld64.c
@@ -53,23 +53,23 @@
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -78,7 +78,7 @@
       a0 = (const int8_t*) ((uintptr_t) a0 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -86,7 +86,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -94,7 +94,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -102,7 +102,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
index 196256c..d8aeb5d 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld128.c
@@ -58,7 +58,7 @@
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -69,7 +69,7 @@
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -78,7 +78,7 @@
       a0 = (const int8_t*) ((uintptr_t) a0 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -86,7 +86,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -94,7 +94,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -102,7 +102,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
index 70a2007..bf725d4 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-sse41-ld64.c
@@ -53,23 +53,23 @@
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -78,7 +78,7 @@
       a0 = (const int8_t*) ((uintptr_t) a0 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -86,7 +86,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -94,7 +94,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -102,7 +102,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
index 873c750..d06ae9d 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld128.c
@@ -58,7 +58,7 @@
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -69,7 +69,7 @@
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -78,7 +78,7 @@
       a0 = (const int8_t*) ((uintptr_t) a0 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -86,7 +86,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -94,7 +94,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -102,7 +102,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
index c5a5235..ddb4be1 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-ssse3-ld64.c
@@ -53,23 +53,23 @@
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -78,7 +78,7 @@
       a0 = (const int8_t*) ((uintptr_t) a0 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -86,7 +86,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -94,7 +94,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -102,7 +102,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
index 3e6be1a..b7321ed 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld128.c
@@ -63,7 +63,7 @@
 
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
-      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -74,7 +74,7 @@
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -83,7 +83,7 @@
       a0 = (const int8_t*) ((uintptr_t) a0 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_maddd_epi16(
@@ -91,7 +91,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_maddd_epi16(
@@ -99,7 +99,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_maddd_epi16(
@@ -107,7 +107,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_maddd_epi16(
diff --git a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
index aeeae22..e79fcea 100644
--- a/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c2-minmax-xop-ld64.c
@@ -58,23 +58,23 @@
 
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc0x0123);
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc0x0123);
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc0x0123);
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
 
       vacc0x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa0, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc0x0123);
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -83,7 +83,7 @@
       a0 = (const int8_t*) ((uintptr_t) a0 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_maddd_epi16(
@@ -91,7 +91,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_maddd_epi16(
@@ -99,7 +99,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_maddd_epi16(
@@ -107,7 +107,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_maddd_epi16(
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
index 0e33bdf..3bd7613 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld128.c
@@ -58,7 +58,7 @@
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
-      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -66,7 +66,7 @@
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
index 225638a..a7df502 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse2-ld64.c
@@ -55,20 +55,20 @@
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
index 87e1325..c7daf75 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld128.c
@@ -58,7 +58,7 @@
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
-      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -66,7 +66,7 @@
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
index d04dd53..5e6028b 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-sse41-ld64.c
@@ -55,20 +55,20 @@
       const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
 
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
index e3437f4..dc09f3b 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld128.c
@@ -58,7 +58,7 @@
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
-      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -66,7 +66,7 @@
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
index 4cffc38..6091aef 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-ssse3-ld64.c
@@ -55,20 +55,20 @@
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
index 194e983..1f257ec 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld128.c
@@ -63,7 +63,7 @@
 
       vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
       vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
-      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -71,7 +71,7 @@
       vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
       vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
index 43fa056..3d28582 100644
--- a/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/1x4c8-minmax-xop-ld64.c
@@ -60,20 +60,20 @@
       const __m128i vxb0 = _mm_cvtepi8_epi16(vb0);
 
       vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
       vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
       vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
 
       vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/1x8c8-minmax-avx2.c b/src/qs8-gemm/gen/1x8c8-minmax-avx2.c
new file mode 100644
index 0000000..0121281
--- /dev/null
+++ b/src/qs8-gemm/gen/1x8c8-minmax-avx2.c
@@ -0,0 +1,159 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx8c8-minmax-avx2.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+
+  do {
+    const __m128i vbias0x0 = _mm_loadu_si32(w);
+    const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+    __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+    const __m128i vbias0x2 = _mm_loadu_si32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t)));
+    const __m128i vbias0x3 = _mm_loadu_si32((const void*) ((uintptr_t) w + 3 * sizeof(int32_t)));
+    __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
+    const __m128i vbias0x4 = _mm_loadu_si32((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
+    const __m128i vbias0x5 = _mm_loadu_si32((const void*) ((uintptr_t) w + 5 * sizeof(int32_t)));
+    __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
+    const __m128i vbias0x6 = _mm_loadu_si32((const void*) ((uintptr_t) w + 6 * sizeof(int32_t)));
+    const __m128i vbias0x7 = _mm_loadu_si32((const void*) ((uintptr_t) w + 7 * sizeof(int32_t)));
+    __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
+    w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
+      const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
+      a0 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
+
+      vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
+
+      vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
+      const __m128i vb45 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int8_t)));
+      const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
+
+      vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
+      const __m128i vb67 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 48 * sizeof(int8_t)));
+      const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
+
+      vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
+
+      w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
+    const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
+
+    const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
+
+    const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+    __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
+
+    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+    const __m256i vacc0x23016745 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
+
+    const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x23016745, vmultiplier), vrounding);
+
+    const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
+    const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
+
+    const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
+
+    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vrem0x01234567 =
+      _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
+
+    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    vacc0x01234567 =
+      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
+
+    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
+
+    vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+    vacc00x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc00x01234567, voutput_min), voutput_max);
+
+    __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
+    __m128i vout_lo = _mm256_castsi256_si128(vout);
+    __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
+    if (nc >= 8) {
+      _mm_storel_epi64((__m128i*) c0, vout_lo);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - k);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        _mm_storeu_si32(c0, vout_lo);
+
+        c0 += 4;
+
+        vout_lo = _mm_srli_epi64(vout_lo, 32);
+        vout_hi = _mm_srli_epi64(vout_hi, 32);
+      }
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+
+        c0 += 2;
+
+        vout_lo = _mm_srli_epi32(vout_lo, 16);
+        vout_hi = _mm_srli_epi32(vout_hi, 16);
+      }
+      if (nc & 1) {
+        *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
index 7d3bc56..890cc8f 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld128.c
@@ -73,7 +73,7 @@
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
       vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
       vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
-      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -83,7 +83,7 @@
       vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
index d44732a..2d95f19 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse2-ld64.c
@@ -69,23 +69,23 @@
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
       vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
       vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
       vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
index 420be81..ee21467 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld128.c
@@ -73,7 +73,7 @@
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
       vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
       vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
-      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -83,7 +83,7 @@
       vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
index af42b60..ccb0cbe 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-sse41-ld64.c
@@ -69,23 +69,23 @@
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
       vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
       vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
       vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
 
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
index 8b9843a..5fe28ca 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld128.c
@@ -73,7 +73,7 @@
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
       vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
       vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
-      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -83,7 +83,7 @@
       vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
index eba6930..5f0cf21 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-ssse3-ld64.c
@@ -69,23 +69,23 @@
 
       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
       vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
       vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
       vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
index a98794c..1eae140 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld128.c
@@ -78,7 +78,7 @@
       vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
       vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
       vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
-      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -88,7 +88,7 @@
       vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
       vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
index f2f7b66..5850deb 100644
--- a/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/2x4c8-minmax-xop-ld64.c
@@ -74,23 +74,23 @@
 
       vacc0x0 = _mm_maddd_epi16(vxa0, vxb0, vacc0x0);
       vacc1x0 = _mm_maddd_epi16(vxa1, vxb0, vacc1x0);
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
       vacc0x1 = _mm_maddd_epi16(vxa0, vxb1, vacc0x1);
       vacc1x1 = _mm_maddd_epi16(vxa1, vxb1, vacc1x1);
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
       vacc0x2 = _mm_maddd_epi16(vxa0, vxb2, vacc0x2);
       vacc1x2 = _mm_maddd_epi16(vxa1, vxb2, vacc1x2);
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
 
       vacc0x3 = _mm_maddd_epi16(vxa0, vxb3, vacc0x3);
       vacc1x3 = _mm_maddd_epi16(vxa1, vxb3, vacc1x3);
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k += 8 * sizeof(int8_t);
     }
 
diff --git a/src/qs8-gemm/gen/2x8c8-minmax-avx2.c b/src/qs8-gemm/gen/2x8c8-minmax-avx2.c
new file mode 100644
index 0000000..5bcba2b
--- /dev/null
+++ b/src/qs8-gemm/gen/2x8c8-minmax-avx2.c
@@ -0,0 +1,198 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx8c8-minmax-avx2.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+
+  do {
+    const __m128i vbias0x0 = _mm_loadu_si32(w);
+    const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+    __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+    const __m128i vbias0x2 = _mm_loadu_si32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t)));
+    const __m128i vbias0x3 = _mm_loadu_si32((const void*) ((uintptr_t) w + 3 * sizeof(int32_t)));
+    __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
+    const __m128i vbias0x4 = _mm_loadu_si32((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
+    const __m128i vbias0x5 = _mm_loadu_si32((const void*) ((uintptr_t) w + 5 * sizeof(int32_t)));
+    __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
+    const __m128i vbias0x6 = _mm_loadu_si32((const void*) ((uintptr_t) w + 6 * sizeof(int32_t)));
+    const __m128i vbias0x7 = _mm_loadu_si32((const void*) ((uintptr_t) w + 7 * sizeof(int32_t)));
+    __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
+    __m256i vacc1x01 = vacc0x01;
+    __m256i vacc1x23 = vacc0x23;
+    __m256i vacc1x45 = vacc0x45;
+    __m256i vacc1x67 = vacc0x67;
+    w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
+      const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
+      const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
+      a1 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
+
+      vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
+      vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
+
+      vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
+      vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
+      const __m128i vb45 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int8_t)));
+      const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
+
+      vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
+      vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
+      const __m128i vb67 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 48 * sizeof(int8_t)));
+      const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
+
+      vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
+      vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
+
+      w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
+    const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
+    const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
+    const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
+
+    const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
+    const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
+
+    const __m256i vpermuate_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+    __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermuate_mask);
+    __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermuate_mask);
+
+    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+    const __m256i vacc0x23016745 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m256i vacc1x23016745 = _mm256_shuffle_epi32(vacc1x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
+    const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
+
+    const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x23016745, vmultiplier), vrounding);
+    const __m256i vprod1x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x23016745, vmultiplier), vrounding);
+
+    const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
+    const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
+    const __m256i vq31prod1x0246 = _mm256_srli_epi64(vprod1x0246, 31);
+    const __m256i vq31prod1x1357 = _mm256_add_epi64(vprod1x1357, vprod1x1357);
+
+    const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
+    const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
+
+    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vrem0x01234567 =
+      _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
+    const __m256i vrem1x01234567 =
+      _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
+
+    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    vacc0x01234567 =
+      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
+    vacc1x01234567 =
+      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
+
+    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
+
+    vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+    vacc01x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc01x01234567, voutput_min), voutput_max);
+
+    __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc01x01234567);
+    __m128i vout_lo = _mm256_castsi256_si128(vout);
+    __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
+    if (nc >= 8) {
+      _mm_storel_epi64((__m128i*) c0, vout_lo);
+      _mm_storel_epi64((__m128i*) c1, vout_hi);
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - k);
+      a1 = (const int8_t*) ((uintptr_t) a1 - k);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        _mm_storeu_si32(c0, vout_lo);
+        _mm_storeu_si32(c1, vout_hi);
+
+        c0 += 4;
+        c1 += 4;
+
+        vout_lo = _mm_srli_epi64(vout_lo, 32);
+        vout_hi = _mm_srli_epi64(vout_hi, 32);
+      }
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
+
+        c0 += 2;
+        c1 += 2;
+
+        vout_lo = _mm_srli_epi32(vout_lo, 16);
+        vout_hi = _mm_srli_epi32(vout_hi, 16);
+      }
+      if (nc & 1) {
+        *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
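
For readers unfamiliar with the c8 weight layout used here: after the 8 int32 bias values, the packed weights are consumed in 64-byte blocks, 8 consecutive int8 k-values for column 0, then 8 for column 1, and so on through column 7 (this is what the _mm_load_si128 / _mm256_cvtepi8_epi16 / _mm256_madd_epi16 sequence implies). Each 256-bit vaccMxN(N+1) accumulator therefore carries partial sums for two columns, one per 128-bit lane, and the two _mm256_hadd_epi32 passes plus the final _mm256_permutevar8x32_epi32 collapse them into one int32 per column in order 0..7. A scalar sketch of the dot products one 2x8 tile computes, assuming kc is a multiple of 8 (the packed buffer is padded otherwise); the function and its signature are illustrative only:

#include <stddef.h>
#include <stdint.h>

static void qs8_gemm_2x8c8_reference(size_t kc, const int8_t* a0, const int8_t* a1,
                                     const void* w, int32_t acc[2][8])
{
  const int32_t* bias = (const int32_t*) w;
  /* Weights follow the 8 biases: per 8-deep k block, 8 int8 values for
     column 0, then 8 for column 1, ..., then 8 for column 7 (64 bytes). */
  const int8_t* wb = (const int8_t*) (bias + 8);
  for (size_t n = 0; n < 8; n++) {
    acc[0][n] = bias[n];
    acc[1][n] = bias[n];
  }
  for (size_t k = 0; k < kc; k += 8) {
    for (size_t n = 0; n < 8; n++) {
      for (size_t kk = 0; kk < 8; kk++) {
        acc[0][n] += (int32_t) a0[k + kk] * (int32_t) wb[n * 8 + kk];
        acc[1][n] += (int32_t) a1[k + kk] * (int32_t) wb[n * 8 + kk];
      }
    }
    wb += 64;
  }
}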
diff --git a/src/qs8-gemm/gen/3x8c8-minmax-avx2.c b/src/qs8-gemm/gen/3x8c8-minmax-avx2.c
new file mode 100644
index 0000000..4f81cbe
--- /dev/null
+++ b/src/qs8-gemm/gen/3x8c8-minmax-avx2.c
@@ -0,0 +1,240 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-gemm/MRx8c8-minmax-avx2.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/gemm.h>
+
+
+void xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    const int8_t* restrict a,
+    size_t a_stride,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(kc % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  const int8_t* a0 = a;
+  int8_t* c0 = c;
+  const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    a1 = a0;
+    c1 = c0;
+  }
+  const int8_t* a2 = (const int8_t*) ((uintptr_t) a1 + a_stride);
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    a2 = a1;
+    c2 = c1;
+  }
+
+  do {
+    const __m128i vbias0x0 = _mm_loadu_si32(w);
+    const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+    __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+    const __m128i vbias0x2 = _mm_loadu_si32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t)));
+    const __m128i vbias0x3 = _mm_loadu_si32((const void*) ((uintptr_t) w + 3 * sizeof(int32_t)));
+    __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
+    const __m128i vbias0x4 = _mm_loadu_si32((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
+    const __m128i vbias0x5 = _mm_loadu_si32((const void*) ((uintptr_t) w + 5 * sizeof(int32_t)));
+    __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
+    const __m128i vbias0x6 = _mm_loadu_si32((const void*) ((uintptr_t) w + 6 * sizeof(int32_t)));
+    const __m128i vbias0x7 = _mm_loadu_si32((const void*) ((uintptr_t) w + 7 * sizeof(int32_t)));
+    __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
+    __m256i vacc1x01 = vacc0x01;
+    __m256i vacc1x23 = vacc0x23;
+    __m256i vacc1x45 = vacc0x45;
+    __m256i vacc1x67 = vacc0x67;
+    __m256i vacc2x01 = vacc0x01;
+    __m256i vacc2x23 = vacc0x23;
+    __m256i vacc2x45 = vacc0x45;
+    __m256i vacc2x67 = vacc0x67;
+    w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+    size_t k = 0;
+    while (k < kc) {
+      const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
+      const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
+      a0 += 8;
+      const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
+      const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
+      a1 += 8;
+      const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
+      const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
+      a2 += 8;
+
+      const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+      const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
+
+      vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
+      vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
+      vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
+      const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+      const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
+
+      vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
+      vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
+      vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
+      const __m128i vb45 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int8_t)));
+      const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
+
+      vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
+      vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
+      vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
+      const __m128i vb67 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 48 * sizeof(int8_t)));
+      const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
+
+      vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
+      vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
+      vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
+
+      w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+      k += 8 * sizeof(int8_t);
+    }
+
+    const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
+    const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
+    const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
+    const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
+    const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
+    const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
+
+    const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
+    const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
+    const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
+
+    const __m256i vpermuate_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+    __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermuate_mask);
+    __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermuate_mask);
+    __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermuate_mask);
+
+    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+    const __m256i vacc0x23016745 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m256i vacc1x23016745 = _mm256_shuffle_epi32(vacc1x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m256i vacc2x23016745 = _mm256_shuffle_epi32(vacc2x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
+    const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
+    const __m256i vprod2x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc2x01234567, vmultiplier), vrounding);
+
+    const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x23016745, vmultiplier), vrounding);
+    const __m256i vprod1x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x23016745, vmultiplier), vrounding);
+    const __m256i vprod2x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc2x23016745, vmultiplier), vrounding);
+
+    const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
+    const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
+    const __m256i vq31prod1x0246 = _mm256_srli_epi64(vprod1x0246, 31);
+    const __m256i vq31prod1x1357 = _mm256_add_epi64(vprod1x1357, vprod1x1357);
+    const __m256i vq31prod2x0246 = _mm256_srli_epi64(vprod2x0246, 31);
+    const __m256i vq31prod2x1357 = _mm256_add_epi64(vprod2x1357, vprod2x1357);
+
+    const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
+    const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
+    const __m256i vq31prod2x01234567 = _mm256_blend_epi16(vq31prod2x0246, vq31prod2x1357, 0xCC);
+
+    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vrem0x01234567 =
+      _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
+    const __m256i vrem1x01234567 =
+      _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
+    const __m256i vrem2x01234567 =
+      _mm256_add_epi32(_mm256_and_si256(vq31prod2x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod2x01234567));
+
+    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    vacc0x01234567 =
+      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
+    vacc1x01234567 =
+      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
+    vacc2x01234567 =
+      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod2x01234567, vshift), _mm256_cmpgt_epi32(vrem2x01234567, vremainder_threshold));
+
+    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
+    __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
+
+    vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+    vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+    vacc01x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc01x01234567, voutput_min), voutput_max);
+    vacc22x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc22x01234567, voutput_min), voutput_max);
+
+    __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
+    __m128i vout_lo = _mm256_castsi256_si128(vout);
+    __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
+    if (nc >= 8) {
+      _mm_storel_epi64((__m128i*) c0, vout_lo);
+      _mm_storel_epi64((__m128i*) c1, vout_hi);
+      _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
+
+      a0 = (const int8_t*) ((uintptr_t) a0 - k);
+      a1 = (const int8_t*) ((uintptr_t) a1 - k);
+      a2 = (const int8_t*) ((uintptr_t) a2 - k);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        _mm_storeu_si32(c0, vout_lo);
+        _mm_storeu_si32(c1, vout_hi);
+        *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
+
+        c0 += 4;
+        c1 += 4;
+        c2 += 4;
+
+        vout_lo = _mm_srli_epi64(vout_lo, 32);
+        vout_hi = _mm_srli_epi64(vout_hi, 32);
+      }
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
+
+        c0 += 2;
+        c1 += 2;
+        c2 += 2;
+
+        vout_lo = _mm_srli_epi32(vout_lo, 16);
+        vout_hi = _mm_srli_epi32(vout_hi, 16);
+      }
+      if (nc & 1) {
+        *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+        *c1 = (int8_t) _mm_extract_epi8(vout_hi, 0);
+        *c2 = (int8_t) _mm_extract_epi8(vout_lo, 8);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
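
The 3-row store above relies on the lane layout produced by the pack/permute sequence: vacc01x01234567 carries row 0 in its low 128 bits and row 1 in its high 128 bits, while vacc22x01234567 carries row 2 twice, so after _mm256_packs_epi16 the low half of vout is [row0 | row2] and the high half is [row1 | row2]. That is why row 2 is written from the upper 64 bits of vout_lo via _mm_storeh_pi, and why the nc & 4 / nc & 2 / nc & 1 tails pull row 2 from elements 2, 4 and 8 of vout_lo after the shifts. A plain-C sketch of the full-width store (a hypothetical helper, shown only to document the byte layout):

#include <string.h>
#include <stdint.h>

static void store_3x8_tile(int8_t* c0, int8_t* c1, int8_t* c2,
                           const int8_t vout_lo[16], const int8_t vout_hi[16])
{
  memcpy(c0, vout_lo, 8);      /* _mm_storel_epi64(c0, vout_lo): row 0 */
  memcpy(c1, vout_hi, 8);      /* _mm_storel_epi64(c1, vout_hi): row 1 */
  memcpy(c2, vout_lo + 8, 8);  /* _mm_storeh_pi(c2, vout_lo):    row 2 */
}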
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
index 2f17e7f..824fc16 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld128.c
@@ -100,7 +100,7 @@
         _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -123,7 +123,7 @@
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -141,7 +141,7 @@
       a3 = (const int8_t*) ((uintptr_t) a3 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -155,7 +155,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -169,7 +169,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -183,7 +183,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
index 0682df8..d21c2c3 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse2-ld64.c
@@ -89,7 +89,7 @@
         _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -100,7 +100,7 @@
         _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -111,7 +111,7 @@
         _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -123,7 +123,7 @@
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -141,7 +141,7 @@
       a3 = (const int8_t*) ((uintptr_t) a3 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -155,7 +155,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -169,7 +169,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -183,7 +183,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
index e04bd6a..41f08c3 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld128.c
@@ -100,7 +100,7 @@
         _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -123,7 +123,7 @@
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -141,7 +141,7 @@
       a3 = (const int8_t*) ((uintptr_t) a3 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -155,7 +155,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -169,7 +169,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -183,7 +183,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
index af6875a..988afd6 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-sse41-ld64.c
@@ -89,7 +89,7 @@
         _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -100,7 +100,7 @@
         _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -111,7 +111,7 @@
         _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -123,7 +123,7 @@
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -141,7 +141,7 @@
       a3 = (const int8_t*) ((uintptr_t) a3 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -155,7 +155,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -169,7 +169,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -183,7 +183,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
index 0a49c93..d98801f 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld128.c
@@ -100,7 +100,7 @@
         _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -123,7 +123,7 @@
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -141,7 +141,7 @@
       a3 = (const int8_t*) ((uintptr_t) a3 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -155,7 +155,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -169,7 +169,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -183,7 +183,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
index 2790798..1dfaa19 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-ssse3-ld64.c
@@ -89,7 +89,7 @@
         _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0));
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -100,7 +100,7 @@
         _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1));
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -111,7 +111,7 @@
         _mm_madd_epi16(_mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2));
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -123,7 +123,7 @@
       vacc3x0123 = _mm_add_epi32(vacc3x0123,
         _mm_madd_epi16(_mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3));
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -141,7 +141,7 @@
       a3 = (const int8_t*) ((uintptr_t) a3 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -155,7 +155,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -169,7 +169,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_add_epi32(vacc0x0123,
@@ -183,7 +183,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_add_epi32(vacc0x0123,
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
index 7fb3c42..947f6d9 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld128.c
@@ -105,7 +105,7 @@
         _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
       vacc3x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123);
-      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb23 = _mm_loadu_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vsb23 = _mm_cmpgt_epi8(_mm_setzero_si128(), vb23);
       const __m128i vxb2 = _mm_unpacklo_epi8(vb23, vsb23);
       const __m128i vxb3 = _mm_unpackhi_epi8(vb23, vsb23);
@@ -128,7 +128,7 @@
       vacc3x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -146,7 +146,7 @@
       a3 = (const int8_t*) ((uintptr_t) a3 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_maddd_epi16(
@@ -160,7 +160,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_maddd_epi16(
@@ -174,7 +174,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_maddd_epi16(
@@ -188,7 +188,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_maddd_epi16(
diff --git a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
index aa71c9f..f5f654f 100644
--- a/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
+++ b/src/qs8-gemm/gen/4x4c2-minmax-xop-ld64.c
@@ -94,7 +94,7 @@
         _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc2x0123);
       vacc3x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(0, 0, 0, 0)), vxb0, vacc3x0123);
-      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8));
+      const __m128i vb1 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 8 * sizeof(int8_t)));
       const __m128i vxb1 = _mm_cvtepi8_epi16(vb1);
 
       vacc0x0123 = _mm_maddd_epi16(
@@ -105,7 +105,7 @@
         _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc2x0123);
       vacc3x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(1, 1, 1, 1)), vxb1, vacc3x0123);
-      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16));
+      const __m128i vb2 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
       const __m128i vxb2 = _mm_cvtepi8_epi16(vb2);
 
       vacc0x0123 = _mm_maddd_epi16(
@@ -116,7 +116,7 @@
         _mm_shuffle_epi32(vxa2, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc2x0123);
       vacc3x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(2, 2, 2, 2)), vxb2, vacc3x0123);
-      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24));
+      const __m128i vb3 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 24 * sizeof(int8_t)));
       const __m128i vxb3 = _mm_cvtepi8_epi16(vb3);
 
       vacc0x0123 = _mm_maddd_epi16(
@@ -128,7 +128,7 @@
       vacc3x0123 = _mm_maddd_epi16(
         _mm_shuffle_epi32(vxa3, _MM_SHUFFLE(3, 3, 3, 3)), vxb3, vacc3x0123);
 
-      w = (const void*) ((uintptr_t) w + 32);
+      w = (const void*) ((uintptr_t) w + 32 * sizeof(int8_t));
       k -= 8 * sizeof(int8_t);
     }
     if (k != 0) {
@@ -146,7 +146,7 @@
       a3 = (const int8_t*) ((uintptr_t) a3 + k);
 
       const __m128i vb0 = _mm_loadl_epi64((const __m128i*) w);
-      w = (const void*) ((uintptr_t) w + 8);
+      w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
       const __m128i vxb0 = _mm_unpacklo_epi8(vb0, _mm_cmpgt_epi8(_mm_setzero_si128(), vb0));
 
       vacc0x0123 = _mm_maddd_epi16(
@@ -160,7 +160,7 @@
 
       if (k > 2 * sizeof(int8_t)) {
         const __m128i vb1 = _mm_loadl_epi64((const __m128i*) w);
-        w = (const void*) ((uintptr_t) w + 8);
+        w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
         const __m128i vxb1 = _mm_unpacklo_epi8(vb1, _mm_cmpgt_epi8(_mm_setzero_si128(), vb1));
 
         vacc0x0123 = _mm_maddd_epi16(
@@ -174,7 +174,7 @@
 
         if (k > 4 * sizeof(int8_t)) {
           const __m128i vb2 = _mm_loadl_epi64((const __m128i*) w);
-          w = (const void*) ((uintptr_t) w + 8);
+          w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
           const __m128i vxb2 = _mm_unpacklo_epi8(vb2, _mm_cmpgt_epi8(_mm_setzero_si128(), vb2));
 
           vacc0x0123 = _mm_maddd_epi16(
@@ -188,7 +188,7 @@
 
           if (k > 6 * sizeof(int8_t)) {
             const __m128i vb3 = _mm_loadl_epi64((const __m128i*) w);
-            w = (const void*) ((uintptr_t) w + 8);
+            w = (const void*) ((uintptr_t) w + 8 * sizeof(int8_t));
             const __m128i vxb3 = _mm_unpacklo_epi8(vb3, _mm_cmpgt_epi8(_mm_setzero_si128(), vb3));
 
             vacc0x0123 = _mm_maddd_epi16(
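
The recurring "+ 8" / "+ 16" / "+ 24" / "+ 32" -> "... * sizeof(int8_t)" hunks in the SSE2/SSSE3/SSE4.1/XOP GEMM kernels above are purely cosmetic: the packed weights are int8, so sizeof(int8_t) == 1 and the address arithmetic is unchanged; the offsets now simply read as element counts, matching the style of the new AVX2 kernels. A trivial illustration of the equivalence:

#include <assert.h>
#include <stdint.h>

int main(void) {
  const int8_t w[64] = {0};
  /* sizeof(int8_t) is 1 by definition, so scaling a byte offset by it is a no-op. */
  assert((uintptr_t) w + 16 == (uintptr_t) w + 16 * sizeof(int8_t));
  return 0;
}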
diff --git a/src/qs8-igemm/MRx4c2-minmax-sse.c.in b/src/qs8-igemm/MRx4c2-minmax-sse.c.in
index 028a64a..563f78f 100644
--- a/src/qs8-igemm/MRx4c2-minmax-sse.c.in
+++ b/src/qs8-igemm/MRx4c2-minmax-sse.c.in
@@ -4,6 +4,7 @@
 // LICENSE file in the root directory of this source tree.
 
 $SSE_HEADER = {2: "emmintrin.h", 3: "tmmintrin.h", 4: "smmintrin.h", 5: "ammintrin.h"}[SSE]
+$assert MR <= 4
 #include <assert.h>
 
 $if SSE == 5:
diff --git a/src/qs8-igemm/MRx4c8-minmax-sse.c.in b/src/qs8-igemm/MRx4c8-minmax-sse.c.in
index 18edf63..b512894 100644
--- a/src/qs8-igemm/MRx4c8-minmax-sse.c.in
+++ b/src/qs8-igemm/MRx4c8-minmax-sse.c.in
@@ -4,6 +4,7 @@
 // LICENSE file in the root directory of this source tree.
 
 $SSE_HEADER = {2: "emmintrin.h", 3: "tmmintrin.h", 4: "smmintrin.h", 5: "ammintrin.h"}[SSE]
+$assert MR <= 4
 #include <assert.h>
 
 $if SSE == 5:
@@ -38,7 +39,9 @@
   assert(mr <= ${MR});
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (${MR} * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
@@ -224,7 +227,7 @@
     $for M in range(0, MR, 2):
       vacc${M}${min(M+1, MR-1)}x0123 = _mm_min_epi16(_mm_max_epi16(vacc${M}${min(M+1, MR-1)}x0123, voutput_min), voutput_max);
 
-    $if M > 2:
+    $if MR > 2:
       __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc${min(2, MR-1)}${min(3, MR-1)}x0123);
     $else:
       __m128i vout = _mm_packs_epi16(vacc0${min(1, MR-1)}x0123, vacc0${min(1, MR-1)}x0123);
diff --git a/src/qs8-igemm/MRx8c8-minmax-avx2.c.in b/src/qs8-igemm/MRx8c8-minmax-avx2.c.in
new file mode 100644
index 0000000..9cd6d70
--- /dev/null
+++ b/src/qs8-igemm/MRx8c8-minmax-avx2.c.in
@@ -0,0 +1,219 @@
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert MR <= 4
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_${MR}x8c8__avx2(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= ${MR});
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (${MR} * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  $for M in range(1, MR):
+    int8_t* c${M} = (int8_t*) ((uintptr_t) c${M-1} + cm_stride);
+    $if M % 2 == 0:
+      if XNN_UNPREDICTABLE(mr <= ${M}) {
+        c${M} = c${M-1};
+      }
+    $elif M + 1 == MR:
+      if XNN_UNPREDICTABLE(mr != ${M+1}) {
+        c${M} = c${M-1};
+      }
+    $else:
+      if XNN_UNPREDICTABLE(mr < ${M+1}) {
+        c${M} = c${M-1};
+      }
+
+  do {
+    const __m128i vbias0x0 = _mm_loadu_si32(w);
+    const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+    __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+    $for N in range(2, 8, 2):
+      const __m128i vbias0x${N} = _mm_loadu_si32((const void*) ((uintptr_t) w + ${N} * sizeof(int32_t)));
+      const __m128i vbias0x${N+1} = _mm_loadu_si32((const void*) ((uintptr_t) w + ${N+1} * sizeof(int32_t)));
+      __m256i vacc0x${N}${N+1} = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x${N}), vbias0x${N+1}, 1);
+    $for M in range(1, MR):
+      $for N in range(0, 8, 2):
+        __m256i vacc${M}x${N}${N+1} = vacc0x${N}${N+1};
+    w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      $for M in range(MR):
+        const int8_t* restrict a${M} = a[${M}];
+        if XNN_UNPREDICTABLE(a${M} != zero) {
+          a${M} = (const int8_t*) ((uintptr_t) a${M} + a_offset);
+        }
+      a += ${MR};
+
+      size_t k = 0;
+      while (k < kc) {
+        $for M in range(MR):
+          const __m128i va${M} = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a${M}));
+          const __m256i vxa${M} = _mm256_cvtepi8_epi16(va${M});
+          a${M} += 8;
+
+        $for N in range(0, 8, 2):
+          $if N == 0:
+            const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) w);
+          $else:
+            const __m128i vb${N}${N+1} = _mm_load_si128((const __m128i*) ((uintptr_t) w + ${N * 8} * sizeof(int8_t)));
+          const __m256i vxb${N}${N+1} = _mm256_cvtepi8_epi16(vb${N}${N+1});
+
+          $for M in range(MR):
+            vacc${M}x${N}${N+1} = _mm256_add_epi32(vacc${M}x${N}${N+1}, _mm256_madd_epi16(vxa${M}, vxb${N}${N+1}));
+
+        w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+        k += 8 * sizeof(int8_t);
+      }
+      p -= ${MR} * sizeof(void*);
+    } while (p != 0);
+
+    $for M in range(MR):
+      const __m256i vacc${M}x0213 = _mm256_hadd_epi32(vacc${M}x01, vacc${M}x23);
+      const __m256i vacc${M}x4657 = _mm256_hadd_epi32(vacc${M}x45, vacc${M}x67);
+
+    $for M in range(MR):
+      const __m256i vacc${M}x02461357 = _mm256_hadd_epi32(vacc${M}x0213, vacc${M}x4657);
+
+    const __m256i vpermuate_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+    $for M in range(MR):
+      __m256i vacc${M}x01234567 = _mm256_permutevar8x32_epi32(vacc${M}x02461357, vpermuate_mask);
+
+    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+    $for M in range(MR):
+      const __m256i vacc${M}x23016745 = _mm256_shuffle_epi32(vacc${M}x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+    $for M in range(MR):
+      const __m256i vprod${M}x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc${M}x01234567, vmultiplier), vrounding);
+
+    $for M in range(MR):
+      const __m256i vprod${M}x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc${M}x23016745, vmultiplier), vrounding);
+
+    $for M in range(MR):
+      const __m256i vq31prod${M}x0246 = _mm256_srli_epi64(vprod${M}x0246, 31);
+      const __m256i vq31prod${M}x1357 = _mm256_add_epi64(vprod${M}x1357, vprod${M}x1357);
+
+    $for M in range(MR):
+      const __m256i vq31prod${M}x01234567 = _mm256_blend_epi16(vq31prod${M}x0246, vq31prod${M}x1357, 0xCC);
+
+    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    $for M in range(MR):
+      const __m256i vrem${M}x01234567 =
+        _mm256_add_epi32(_mm256_and_si256(vq31prod${M}x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod${M}x01234567));
+
+    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    $for M in range(MR):
+      vacc${M}x01234567 =
+        _mm256_sub_epi32(_mm256_sra_epi32(vq31prod${M}x01234567, vshift), _mm256_cmpgt_epi32(vrem${M}x01234567, vremainder_threshold));
+
+    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    $for M in range(0, MR, 2):
+      __m256i vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc${M}x01234567, vacc${min(M+1, MR-1)}x01234567), voutput_zero_point);
+
+    $for M in range(0, MR, 2):
+      vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_permute4x64_epi64(vacc${M}${min(M+1, MR-1)}x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+    $for M in range(0, MR, 2):
+      vacc${M}${min(M+1, MR-1)}x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc${M}${min(M+1, MR-1)}x01234567, voutput_min), voutput_max);
+
+    $if MR > 2:
+      __m256i vout = _mm256_packs_epi16(vacc0${min(1, MR-1)}x01234567, vacc${min(2, MR-1)}${min(3, MR-1)}x01234567);
+    $else:
+      __m256i vout = _mm256_packs_epi16(vacc0${min(1, MR-1)}x01234567, vacc0${min(1, MR-1)}x01234567);
+    __m128i vout_lo = _mm256_castsi256_si128(vout);
+    __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
+    if (nc >= 8) {
+      $if MR > 3:
+        _mm_storeh_pi((__m64*) c3, _mm_castsi128_ps(vout_hi));
+      $if MR > 2:
+        _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
+      $if MR > 1:
+        _mm_storel_epi64((__m128i*) c1, vout_hi);
+      _mm_storel_epi64((__m128i*) c0, vout_lo);
+
+      $for M in reversed(range(MR)):
+        c${M} = (int8_t*) ((uintptr_t) c${M} + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        $if MR > 3:
+          *((uint32_t*) c3) = (uint32_t) _mm_extract_epi32(vout_hi, 2);
+        $if MR > 2:
+          *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
+        $if MR > 1:
+          _mm_storeu_si32(c1, vout_hi);
+        _mm_storeu_si32(c0, vout_lo);
+
+        $for M in reversed(range(MR)):
+          c${M} += 4;
+
+        vout_lo = _mm_srli_epi64(vout_lo, 32);
+        vout_hi = _mm_srli_epi64(vout_hi, 32);
+      }
+      if (nc & 2) {
+        $if MR > 3:
+          *((uint16_t*) c3) = (uint16_t) _mm_extract_epi16(vout_hi, 4);
+        $if MR > 2:
+          *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
+        $if MR > 1:
+          *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+
+        $for M in reversed(range(MR)):
+          c${M} += 2;
+
+        vout_lo = _mm_srli_epi32(vout_lo, 16);
+        vout_hi = _mm_srli_epi32(vout_hi, 16);
+      }
+      if (nc & 1) {
+        $if MR > 3:
+          *c3 = (uint8_t) _mm_extract_epi8(vout_hi, 8);
+        $if MR > 2:
+          *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
+        $if MR > 1:
+          *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
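
The template above vectorizes XNNPACK's Q31 requantization: a widening 32x32 multiply against the per-channel multiplier, a 64-bit rounding add, a shift/double/blend trio that extracts bits 31..62 of each product (i.e. product >> 31 truncated to 32 bits), and a remainder-based correction before the zero point, saturating pack, and clamp. A minimal scalar sketch of one lane, assuming rounding = 1 << 30, remainder_mask = (1 << shift) - 1 and remainder_threshold = remainder_mask >> 1 (illustrative names and values, not taken from this patch):

#include <stdint.h>

// Scalar model of the per-lane requantization the AVX2 kernel performs.
// Assumes ">>" on signed values is an arithmetic shift, matching _mm256_sra_epi32.
static inline int32_t requantize_q31(
    int32_t acc, int32_t multiplier, uint32_t shift,
    int32_t remainder_mask, int32_t remainder_threshold)
{
  // _mm256_mul_epi32 + _mm256_add_epi64 with the rounding constant (assumed 1 << 30).
  const int64_t product = (int64_t) acc * (int64_t) multiplier + (INT64_C(1) << 30);
  // _mm256_srli_epi64 by 31 (even lanes) and add-to-self plus _mm256_blend_epi16
  // (odd lanes) both reduce to taking product >> 31 as a 32-bit value.
  const int32_t q31product = (int32_t) (product >> 31);
  // _mm256_and_si256 with remainder_mask plus the _mm256_cmpgt_epi32 sign correction,
  // then _mm256_sra_epi32 and the comparison against remainder_threshold.
  const int32_t remainder = (q31product & remainder_mask) - (int32_t) (q31product < 0);
  return (q31product >> shift) + (int32_t) (remainder > remainder_threshold);
}

The kernel then adds output_zero_point, packs to int16 and int8 with saturation, and clamps to [output_min, output_max].
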
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
index 8d944c1..ebd336f 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld128.c
@@ -32,7 +32,9 @@
   assert(mr <= 1);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
index d2e0d7c..2c0b506 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse2-ld64.c
@@ -32,7 +32,9 @@
   assert(mr <= 1);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
index e2d9b1a..0cc2ea1 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld128.c
@@ -32,7 +32,9 @@
   assert(mr <= 1);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
index 436269c..7361bf5 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-sse41-ld64.c
@@ -32,7 +32,9 @@
   assert(mr <= 1);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
index 3e18267..66122ee 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld128.c
@@ -32,7 +32,9 @@
   assert(mr <= 1);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
index dab68bf..393becc 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-ssse3-ld64.c
@@ -32,7 +32,9 @@
   assert(mr <= 1);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
index 7f76f76..8f78a11 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld128.c
@@ -37,7 +37,9 @@
   assert(mr <= 1);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
index 8163689..8d4fd13 100644
--- a/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/1x4c8-minmax-xop-ld64.c
@@ -37,7 +37,9 @@
   assert(mr <= 1);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/1x8c8-minmax-avx2.c b/src/qs8-igemm/gen/1x8c8-minmax-avx2.c
new file mode 100644
index 0000000..fe48dce
--- /dev/null
+++ b/src/qs8-igemm/gen/1x8c8-minmax-avx2.c
@@ -0,0 +1,172 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx8c8-minmax-avx2.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 1);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (1 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+
+  do {
+    const __m128i vbias0x0 = _mm_loadu_si32(w);
+    const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+    __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+    const __m128i vbias0x2 = _mm_loadu_si32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t)));
+    const __m128i vbias0x3 = _mm_loadu_si32((const void*) ((uintptr_t) w + 3 * sizeof(int32_t)));
+    __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
+    const __m128i vbias0x4 = _mm_loadu_si32((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
+    const __m128i vbias0x5 = _mm_loadu_si32((const void*) ((uintptr_t) w + 5 * sizeof(int32_t)));
+    __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
+    const __m128i vbias0x6 = _mm_loadu_si32((const void*) ((uintptr_t) w + 6 * sizeof(int32_t)));
+    const __m128i vbias0x7 = _mm_loadu_si32((const void*) ((uintptr_t) w + 7 * sizeof(int32_t)));
+    __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
+    w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      a += 1;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
+        const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
+        a0 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
+
+        vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+        const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
+
+        vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
+        const __m128i vb45 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int8_t)));
+        const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
+
+        vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
+        const __m128i vb67 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 48 * sizeof(int8_t)));
+        const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
+
+        vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
+
+        w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 1 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
+    const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
+
+    const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
+
+    const __m256i vpermuate_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+    __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermuate_mask);
+
+    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+    const __m256i vacc0x23016745 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
+
+    const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x23016745, vmultiplier), vrounding);
+
+    const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
+    const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
+
+    const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
+
+    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vrem0x01234567 =
+      _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
+
+    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    vacc0x01234567 =
+      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
+
+    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    __m256i vacc00x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc0x01234567), voutput_zero_point);
+
+    vacc00x01234567 = _mm256_permute4x64_epi64(vacc00x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+    vacc00x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc00x01234567, voutput_min), voutput_max);
+
+    __m256i vout = _mm256_packs_epi16(vacc00x01234567, vacc00x01234567);
+    __m128i vout_lo = _mm256_castsi256_si128(vout);
+    __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
+    if (nc >= 8) {
+      _mm_storel_epi64((__m128i*) c0, vout_lo);
+
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        _mm_storeu_si32(c0, vout_lo);
+
+        c0 += 4;
+
+        vout_lo = _mm_srli_epi64(vout_lo, 32);
+        vout_hi = _mm_srli_epi64(vout_hi, 32);
+      }
+      if (nc & 2) {
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+
+        c0 += 2;
+
+        vout_lo = _mm_srli_epi32(vout_lo, 16);
+        vout_hi = _mm_srli_epi32(vout_hi, 16);
+      }
+      if (nc & 1) {
+        *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
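
In the generated 1x8c8 kernel, each 256-bit accumulator holds four 32-bit partial sums for one output column in its low 128-bit lane and four for the next column in its high lane. The two _mm256_hadd_epi32 steps collapse those partials, leaving the eight column sums in the order 0,2,4,6,1,3,5,7, and the _mm256_permutevar8x32_epi32 with vpermuate_mask restores column order. A small stand-alone illustration of that index bookkeeping (hypothetical names, not part of the patch):

#include <stdio.h>
#include <stdint.h>

int main(void) {
  // Column order across the 256-bit register after the two hadd steps:
  // columns 0,2,4,6 in the low 128-bit lane, columns 1,3,5,7 in the high lane.
  const int32_t column_after_hadd[8] = {0, 2, 4, 6, 1, 3, 5, 7};
  // _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0) lists elements from 7 down to 0, so the
  // index vector consumed by _mm256_permutevar8x32_epi32 is {0, 4, 1, 5, 2, 6, 3, 7}.
  const int permute_index[8] = {0, 4, 1, 5, 2, 6, 3, 7};
  for (int n = 0; n < 8; n++) {
    printf("output element %d holds column %d\n", n, (int) column_after_hadd[permute_index[n]]);
  }
  return 0;  // prints columns 0..7 in their natural order
}
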
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
index a339331..7db2361 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld128.c
@@ -32,7 +32,9 @@
   assert(mr <= 2);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
index 1d66dc9..becb16b 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse2-ld64.c
@@ -32,7 +32,9 @@
   assert(mr <= 2);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
index 24f3ffe..c63c8ae 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld128.c
@@ -32,7 +32,9 @@
   assert(mr <= 2);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
index 2dee200..b7fd902 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-sse41-ld64.c
@@ -32,7 +32,9 @@
   assert(mr <= 2);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
index d26977b..247616b 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld128.c
@@ -32,7 +32,9 @@
   assert(mr <= 2);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
index e04b9af..fdf04cd 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-ssse3-ld64.c
@@ -32,7 +32,9 @@
   assert(mr <= 2);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
index 8f052c9..7f5a4f5 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld128.c
@@ -37,7 +37,9 @@
   assert(mr <= 2);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
index 6b7a9b7..74812a7 100644
--- a/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
+++ b/src/qs8-igemm/gen/2x4c8-minmax-xop-ld64.c
@@ -37,7 +37,9 @@
   assert(mr <= 2);
   assert(nc != 0);
   assert(kc != 0);
-  assert(kc % sizeof(int8_t) == 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
   assert(a != NULL);
   assert(w != NULL);
   assert(c != NULL);
diff --git a/src/qs8-igemm/gen/2x8c8-minmax-avx2.c b/src/qs8-igemm/gen/2x8c8-minmax-avx2.c
new file mode 100644
index 0000000..2a1cffa
--- /dev/null
+++ b/src/qs8-igemm/gen/2x8c8-minmax-avx2.c
@@ -0,0 +1,212 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx8c8-minmax-avx2.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 2);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (2 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr != 2) {
+    c1 = c0;
+  }
+
+  do {
+    const __m128i vbias0x0 = _mm_loadu_si32(w);
+    const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+    __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+    const __m128i vbias0x2 = _mm_loadu_si32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t)));
+    const __m128i vbias0x3 = _mm_loadu_si32((const void*) ((uintptr_t) w + 3 * sizeof(int32_t)));
+    __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
+    const __m128i vbias0x4 = _mm_loadu_si32((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
+    const __m128i vbias0x5 = _mm_loadu_si32((const void*) ((uintptr_t) w + 5 * sizeof(int32_t)));
+    __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
+    const __m128i vbias0x6 = _mm_loadu_si32((const void*) ((uintptr_t) w + 6 * sizeof(int32_t)));
+    const __m128i vbias0x7 = _mm_loadu_si32((const void*) ((uintptr_t) w + 7 * sizeof(int32_t)));
+    __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
+    __m256i vacc1x01 = vacc0x01;
+    __m256i vacc1x23 = vacc0x23;
+    __m256i vacc1x45 = vacc0x45;
+    __m256i vacc1x67 = vacc0x67;
+    w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      a += 2;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
+        const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
+        const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
+        a1 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
+
+        vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
+        vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+        const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
+
+        vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
+        vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
+        const __m128i vb45 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int8_t)));
+        const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
+
+        vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
+        vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
+        const __m128i vb67 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 48 * sizeof(int8_t)));
+        const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
+
+        vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
+        vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
+
+        w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 2 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
+    const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
+    const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
+    const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
+
+    const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
+    const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
+
+    const __m256i vpermuate_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+    __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermuate_mask);
+    __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermuate_mask);
+
+    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+    const __m256i vacc0x23016745 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m256i vacc1x23016745 = _mm256_shuffle_epi32(vacc1x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
+    const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
+
+    const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x23016745, vmultiplier), vrounding);
+    const __m256i vprod1x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x23016745, vmultiplier), vrounding);
+
+    const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
+    const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
+    const __m256i vq31prod1x0246 = _mm256_srli_epi64(vprod1x0246, 31);
+    const __m256i vq31prod1x1357 = _mm256_add_epi64(vprod1x1357, vprod1x1357);
+
+    const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
+    const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
+
+    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vrem0x01234567 =
+      _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
+    const __m256i vrem1x01234567 =
+      _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
+
+    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    vacc0x01234567 =
+      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
+    vacc1x01234567 =
+      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
+
+    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
+
+    vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+    vacc01x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc01x01234567, voutput_min), voutput_max);
+
+    __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc01x01234567);
+    __m128i vout_lo = _mm256_castsi256_si128(vout);
+    __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
+    if (nc >= 8) {
+      _mm_storel_epi64((__m128i*) c1, vout_hi);
+      _mm_storel_epi64((__m128i*) c0, vout_lo);
+
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        _mm_storeu_si32(c1, vout_hi);
+        _mm_storeu_si32(c0, vout_lo);
+
+        c1 += 4;
+        c0 += 4;
+
+        vout_lo = _mm_srli_epi64(vout_lo, 32);
+        vout_hi = _mm_srli_epi64(vout_hi, 32);
+      }
+      if (nc & 2) {
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+
+        c1 += 2;
+        c0 += 2;
+
+        vout_lo = _mm_srli_epi32(vout_lo, 16);
+        vout_hi = _mm_srli_epi32(vout_hi, 16);
+      }
+      if (nc & 1) {
+        *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
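
The store pattern at the end of the 2x8c8 kernel (row 0 written from vout_lo, row 1 from vout_hi) follows from the _mm256_packs_epi32 lane layout and the _mm256_permute4x64_epi64 reorder: the pack leaves row 0 in the low 64-bit block and row 1 in the high block of each 128-bit lane, and the permute regroups the blocks per row. A small trace of that 64-bit-block reorder (illustrative names, not part of the patch):

#include <stdio.h>

int main(void) {
  // 64-bit blocks of _mm256_packs_epi32(vacc0x01234567, vacc1x01234567), low to high:
  // per 128-bit lane, the low half comes from row 0 and the high half from row 1.
  const char* blocks[4] = {"row0 cols0-3", "row1 cols0-3", "row0 cols4-7", "row1 cols4-7"};
  // _MM_SHUFFLE(3, 1, 2, 0) selects source blocks 0, 2, 1, 3 for destination blocks 0..3,
  // placing row 0 in the low 128-bit lane and row 1 in the high lane.
  const int order[4] = {0, 2, 1, 3};
  for (int i = 0; i < 4; i++) {
    printf("dst block %d <- %s\n", i, blocks[order[i]]);
  }
  return 0;
}

After the subsequent _mm256_packs_epi16, vout_lo carries row 0 and vout_hi carries row 1 as int8, which is why c1 is stored from vout_hi while c0 is stored from vout_lo.
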
diff --git a/src/qs8-igemm/gen/3x8c8-minmax-avx2.c b/src/qs8-igemm/gen/3x8c8-minmax-avx2.c
new file mode 100644
index 0000000..7cce630
--- /dev/null
+++ b/src/qs8-igemm/gen/3x8c8-minmax-avx2.c
@@ -0,0 +1,255 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-igemm/MRx8c8-minmax-avx2.c.in
+//   Generator: tools/xngen
+//
+// Copyright 2020 Google LLC
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <immintrin.h>
+
+#include <xnnpack/igemm.h>
+
+
+void xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2(
+    size_t mr,
+    size_t nc,
+    size_t kc,
+    size_t ks,
+    const int8_t** restrict a,
+    const void* restrict w,
+    int8_t* restrict c,
+    size_t cm_stride,
+    size_t cn_stride,
+    size_t a_offset,
+    const int8_t* zero,
+    const union xnn_qs8_gemm_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
+{
+  assert(mr != 0);
+  assert(mr <= 3);
+  assert(nc != 0);
+  assert(kc != 0);
+  assert(ks != 0);
+  assert(ks % (3 * sizeof(void*)) == 0);
+  assert(a_offset % sizeof(int8_t) == 0);
+  assert(a != NULL);
+  assert(w != NULL);
+  assert(c != NULL);
+
+  int8_t* c0 = c;
+  int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
+  if XNN_UNPREDICTABLE(mr < 2) {
+    c1 = c0;
+  }
+  int8_t* c2 = (int8_t*) ((uintptr_t) c1 + cm_stride);
+  if XNN_UNPREDICTABLE(mr <= 2) {
+    c2 = c1;
+  }
+
+  do {
+    const __m128i vbias0x0 = _mm_loadu_si32(w);
+    const __m128i vbias0x1 = _mm_loadu_si32((const void*) ((uintptr_t) w + sizeof(int32_t)));
+    __m256i vacc0x01 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x0), vbias0x1, 1);
+    const __m128i vbias0x2 = _mm_loadu_si32((const void*) ((uintptr_t) w + 2 * sizeof(int32_t)));
+    const __m128i vbias0x3 = _mm_loadu_si32((const void*) ((uintptr_t) w + 3 * sizeof(int32_t)));
+    __m256i vacc0x23 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x2), vbias0x3, 1);
+    const __m128i vbias0x4 = _mm_loadu_si32((const void*) ((uintptr_t) w + 4 * sizeof(int32_t)));
+    const __m128i vbias0x5 = _mm_loadu_si32((const void*) ((uintptr_t) w + 5 * sizeof(int32_t)));
+    __m256i vacc0x45 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x4), vbias0x5, 1);
+    const __m128i vbias0x6 = _mm_loadu_si32((const void*) ((uintptr_t) w + 6 * sizeof(int32_t)));
+    const __m128i vbias0x7 = _mm_loadu_si32((const void*) ((uintptr_t) w + 7 * sizeof(int32_t)));
+    __m256i vacc0x67 = _mm256_inserti128_si256(_mm256_castsi128_si256(vbias0x6), vbias0x7, 1);
+    __m256i vacc1x01 = vacc0x01;
+    __m256i vacc1x23 = vacc0x23;
+    __m256i vacc1x45 = vacc0x45;
+    __m256i vacc1x67 = vacc0x67;
+    __m256i vacc2x01 = vacc0x01;
+    __m256i vacc2x23 = vacc0x23;
+    __m256i vacc2x45 = vacc0x45;
+    __m256i vacc2x67 = vacc0x67;
+    w = (const void*) ((uintptr_t) w + 8 * sizeof(int32_t));
+
+    size_t p = ks;
+    do {
+      const int8_t* restrict a0 = a[0];
+      if XNN_UNPREDICTABLE(a0 != zero) {
+        a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
+      }
+      const int8_t* restrict a1 = a[1];
+      if XNN_UNPREDICTABLE(a1 != zero) {
+        a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
+      }
+      const int8_t* restrict a2 = a[2];
+      if XNN_UNPREDICTABLE(a2 != zero) {
+        a2 = (const int8_t*) ((uintptr_t) a2 + a_offset);
+      }
+      a += 3;
+
+      size_t k = 0;
+      while (k < kc) {
+        const __m128i va0 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a0));
+        const __m256i vxa0 = _mm256_cvtepi8_epi16(va0);
+        a0 += 8;
+        const __m128i va1 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a1));
+        const __m256i vxa1 = _mm256_cvtepi8_epi16(va1);
+        a1 += 8;
+        const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
+        const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
+        a2 += 8;
+
+        const __m128i vb01 = _mm_load_si128((const __m128i*) w);
+        const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);
+
+        vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
+        vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
+        vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
+        const __m128i vb23 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 16 * sizeof(int8_t)));
+        const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);
+
+        vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
+        vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
+        vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
+        const __m128i vb45 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 32 * sizeof(int8_t)));
+        const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);
+
+        vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
+        vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
+        vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
+        const __m128i vb67 = _mm_load_si128((const __m128i*) ((uintptr_t) w + 48 * sizeof(int8_t)));
+        const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);
+
+        vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
+        vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
+        vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
+
+        w = (const void*) ((uintptr_t) w + 64 * sizeof(int8_t));
+        k += 8 * sizeof(int8_t);
+      }
+      p -= 3 * sizeof(void*);
+    } while (p != 0);
+
+    const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
+    const __m256i vacc0x4657 = _mm256_hadd_epi32(vacc0x45, vacc0x67);
+    const __m256i vacc1x0213 = _mm256_hadd_epi32(vacc1x01, vacc1x23);
+    const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
+    const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
+    const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
+
+    const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
+    const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
+    const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
+
+    const __m256i vpermuate_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+    __m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermuate_mask);
+    __m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermuate_mask);
+    __m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermuate_mask);
+
+    const __m256i vmultiplier = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.multiplier));
+    const __m256i vrounding = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.rounding));
+
+    const __m256i vacc0x23016745 = _mm256_shuffle_epi32(vacc0x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m256i vacc1x23016745 = _mm256_shuffle_epi32(vacc1x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+    const __m256i vacc2x23016745 = _mm256_shuffle_epi32(vacc2x01234567, _MM_SHUFFLE(2, 3, 0, 1));
+
+    const __m256i vprod0x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x01234567, vmultiplier), vrounding);
+    const __m256i vprod1x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x01234567, vmultiplier), vrounding);
+    const __m256i vprod2x0246 = _mm256_add_epi64(_mm256_mul_epi32(vacc2x01234567, vmultiplier), vrounding);
+
+    const __m256i vprod0x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc0x23016745, vmultiplier), vrounding);
+    const __m256i vprod1x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc1x23016745, vmultiplier), vrounding);
+    const __m256i vprod2x1357 = _mm256_add_epi64(_mm256_mul_epi32(vacc2x23016745, vmultiplier), vrounding);
+
+    const __m256i vq31prod0x0246 = _mm256_srli_epi64(vprod0x0246, 31);
+    const __m256i vq31prod0x1357 = _mm256_add_epi64(vprod0x1357, vprod0x1357);
+    const __m256i vq31prod1x0246 = _mm256_srli_epi64(vprod1x0246, 31);
+    const __m256i vq31prod1x1357 = _mm256_add_epi64(vprod1x1357, vprod1x1357);
+    const __m256i vq31prod2x0246 = _mm256_srli_epi64(vprod2x0246, 31);
+    const __m256i vq31prod2x1357 = _mm256_add_epi64(vprod2x1357, vprod2x1357);
+
+    const __m256i vq31prod0x01234567 = _mm256_blend_epi16(vq31prod0x0246, vq31prod0x1357, 0xCC);
+    const __m256i vq31prod1x01234567 = _mm256_blend_epi16(vq31prod1x0246, vq31prod1x1357, 0xCC);
+    const __m256i vq31prod2x01234567 = _mm256_blend_epi16(vq31prod2x0246, vq31prod2x1357, 0xCC);
+
+    const __m256i vremainder_mask = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_mask));
+    const __m256i vrem0x01234567 =
+      _mm256_add_epi32(_mm256_and_si256(vq31prod0x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod0x01234567));
+    const __m256i vrem1x01234567 =
+      _mm256_add_epi32(_mm256_and_si256(vq31prod1x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod1x01234567));
+    const __m256i vrem2x01234567 =
+      _mm256_add_epi32(_mm256_and_si256(vq31prod2x01234567, vremainder_mask), _mm256_cmpgt_epi32(_mm256_setzero_si256(), vq31prod2x01234567));
+
+    const __m256i vremainder_threshold = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.remainder_threshold));
+    const __m128i vshift = _mm_load_si128((const __m128i*) params->sse2.shift);
+    vacc0x01234567 =
+      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod0x01234567, vshift), _mm256_cmpgt_epi32(vrem0x01234567, vremainder_threshold));
+    vacc1x01234567 =
+      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod1x01234567, vshift), _mm256_cmpgt_epi32(vrem1x01234567, vremainder_threshold));
+    vacc2x01234567 =
+      _mm256_sub_epi32(_mm256_sra_epi32(vq31prod2x01234567, vshift), _mm256_cmpgt_epi32(vrem2x01234567, vremainder_threshold));
+
+    const __m256i voutput_zero_point = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_zero_point));
+    __m256i vacc01x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc0x01234567, vacc1x01234567), voutput_zero_point);
+    __m256i vacc22x01234567 = _mm256_adds_epi16(_mm256_packs_epi32(vacc2x01234567, vacc2x01234567), voutput_zero_point);
+
+    vacc01x01234567 = _mm256_permute4x64_epi64(vacc01x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+    vacc22x01234567 = _mm256_permute4x64_epi64(vacc22x01234567, _MM_SHUFFLE(3, 1, 2, 0));
+
+    const __m256i voutput_min = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_min));
+    const __m256i voutput_max = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*) params->sse2.output_max));
+    vacc01x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc01x01234567, voutput_min), voutput_max);
+    vacc22x01234567 = _mm256_min_epi16(_mm256_max_epi16(vacc22x01234567, voutput_min), voutput_max);
+
+    __m256i vout = _mm256_packs_epi16(vacc01x01234567, vacc22x01234567);
+    __m128i vout_lo = _mm256_castsi256_si128(vout);
+    __m128i vout_hi = _mm256_extracti128_si256(vout, 1);
+
+    if (nc >= 8) {
+      _mm_storeh_pi((__m64*) c2, _mm_castsi128_ps(vout_lo));
+      _mm_storel_epi64((__m128i*) c1, vout_hi);
+      _mm_storel_epi64((__m128i*) c0, vout_lo);
+
+      c2 = (int8_t*) ((uintptr_t) c2 + cn_stride);
+      c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
+      c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
+
+      a = (const int8_t**restrict) ((uintptr_t) a - ks);
+
+      nc -= 8;
+    } else {
+      if (nc & 4) {
+        *((uint32_t*) c2) = (uint32_t) _mm_extract_epi32(vout_lo, 2);
+        _mm_storeu_si32(c1, vout_hi);
+        _mm_storeu_si32(c0, vout_lo);
+
+        c2 += 4;
+        c1 += 4;
+        c0 += 4;
+
+        vout_lo = _mm_srli_epi64(vout_lo, 32);
+        vout_hi = _mm_srli_epi64(vout_hi, 32);
+      }
+      if (nc & 2) {
+        *((uint16_t*) c2) = (uint16_t) _mm_extract_epi16(vout_lo, 4);
+        *((uint16_t*) c1) = (uint16_t) _mm_extract_epi16(vout_hi, 0);
+        *((uint16_t*) c0) = (uint16_t) _mm_extract_epi16(vout_lo, 0);
+
+        c2 += 2;
+        c1 += 2;
+        c0 += 2;
+
+        vout_lo = _mm_srli_epi32(vout_lo, 16);
+        vout_hi = _mm_srli_epi32(vout_hi, 16);
+      }
+      if (nc & 1) {
+        *c2 = (uint8_t) _mm_extract_epi8(vout_lo, 8);
+        *c1 = (uint8_t) _mm_extract_epi8(vout_hi, 0);
+        *c0 = (int8_t) _mm_extract_epi8(vout_lo, 0);
+      }
+
+      nc = 0;
+    }
+  } while (nc != 0);
+}
diff --git a/src/xnnpack/gemm.h b/src/xnnpack/gemm.h
index f4a876d..ecd0b7d 100644
--- a/src/xnnpack/gemm.h
+++ b/src/xnnpack/gemm.h
@@ -549,6 +549,10 @@
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x4c8__xop_ld128)
 DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x4c8__xop_ld128)
 
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_1x8c8__avx2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_2x8c8__avx2)
+DECLARE_QS8_GEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_gemm_minmax_ukernel_3x8c8__avx2)
+
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/src/xnnpack/igemm.h b/src/xnnpack/igemm.h
index 590fcae..cf262ee 100644
--- a/src/xnnpack/igemm.h
+++ b/src/xnnpack/igemm.h
@@ -364,6 +364,10 @@
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x4c8__xop_ld128)
 DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x4c8__xop_ld128)
 
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_1x8c8__avx2)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_2x8c8__avx2)
+DECLARE_QS8_IGEMM_MINMAX_UKERNEL_FUNCTION(xnn_qs8_igemm_minmax_ukernel_3x8c8__avx2)
+
 
 #ifdef __cplusplus
 }  // extern "C"